In [2]:
# Core data stack plus scikit-learn utilities used later in the notebook
# (model selection, linear/ensemble regressors, regression metrics); the
# scipy distributions feed RandomizedSearchCV's parameter sampling.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, median_absolute_error
from scipy.stats import uniform, randint
import warnings
# Silence every warning globally. NOTE(review): this also hides pandas
# SettingWithCopyWarning and sklearn deprecation notices — consider
# suppressing specific warning categories instead.
warnings.simplefilter("ignore")
In [3]:
# Load the LFB mobilisation records (2021-2024) from a local Excel export.
# NOTE(review): hardcoded absolute Windows path — this breaks on any other
# machine; consider a configurable DATA_DIR (pathlib.Path) instead.
mobi_2021 = pd.read_excel(r"C:\Users\devin\OneDrive\Bureau\Data_Pompier\Mobilisation\LFB Mobilisation data 2021 - 2024.xlsx")
In [4]:
# Load the LFB incident records (2018 onwards). NOTE(review): hardcoded
# absolute path, and the double ".csv.xlsx" extension suggests a CSV that was
# re-saved as Excel — verify the file really is an Excel workbook.
incident_2018_onwards= pd.read_excel(r"C:\Users\devin\OneDrive\Bureau\Data_Pompier\Incidents\LFB Incident data from 2018 onwards.csv.xlsx")
In [5]:
# Load the fire-station GPS reference table (semicolon-separated, Latin-1
# encoded). NOTE(review): hardcoded absolute path, same portability caveat.
gps_station = pd.read_csv(r"C:\Users\devin\OneDrive\Bureau\Data_Pompier\GPS Station.csv", sep=";",encoding="ISO-8859-1")

Mobilisation 2021¶

In [10]:
# Preview the first rows of the mobilisation data.
mobi_2021.head()
Out[10]:
IncidentNumber CalYear HourOfCall ResourceMobilisationId Resource_Code PerformanceReporting DateAndTimeMobilised DateAndTimeMobile DateAndTimeArrived TurnoutTimeSeconds ... DateAndTimeLeft DateAndTimeReturned DeployedFromStation_Code DeployedFromStation_Name DeployedFromLocation PumpOrder PlusCode_Code PlusCode_Description DelayCodeId DelayCode_Description
0 000004-01012021 2021 0 5769249 A321 1 2021-01-01 00:06:47 2021-01-01 00:07:35 2021-01-01 00:09:48 48.0 ... 2021-01-01 00:57:30 NaN A32 Hornsey Home Station 1 Initial Initial Mobilisation NaN NaN
1 000005-01012021 2021 0 5769250 F351 1 2021-01-01 00:07:47 2021-01-01 00:09:41 2021-01-01 00:11:57 114.0 ... 2021-01-01 00:18:39 NaN F35 Woodford Home Station 1 Initial Initial Mobilisation NaN NaN
2 000006-01012021 2021 0 5769251 F412 1 2021-01-01 00:08:21 2021-01-01 00:10:32 2021-01-01 00:14:37 131.0 ... 2021-01-01 00:24:43 NaN F41 Dagenham Home Station 1 Initial Initial Mobilisation 12.0 Not held up
3 000007-01012021 2021 0 5769252 H331 1 2021-01-01 00:12:23 2021-01-01 00:13:16 2021-01-01 00:19:12 53.0 ... 2021-01-01 00:40:49 NaN H33 Wandsworth Home Station 1 Initial Initial Mobilisation 8.0 Traffic calming measures
4 000007-01012021 2021 0 5769253 G351 2 2021-01-01 00:12:23 2021-01-01 00:13:32 2021-01-01 00:19:48 69.0 ... 2021-01-01 00:29:54 NaN G35 Fulham Home Station 2 Initial Initial Mobilisation NaN NaN

5 rows × 22 columns

In [11]:
# Column dtypes and non-null counts (587,919 rows x 22 columns per output).
mobi_2021.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 587919 entries, 0 to 587918
Data columns (total 22 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   IncidentNumber            587919 non-null  object        
 1   CalYear                   587919 non-null  int64         
 2   HourOfCall                587919 non-null  int64         
 3   ResourceMobilisationId    587919 non-null  int64         
 4   Resource_Code             587919 non-null  object        
 5   PerformanceReporting      587919 non-null  object        
 6   DateAndTimeMobilised      587919 non-null  datetime64[ns]
 7   DateAndTimeMobile         585390 non-null  datetime64[ns]
 8   DateAndTimeArrived        587919 non-null  datetime64[ns]
 9   TurnoutTimeSeconds        585372 non-null  float64       
 10  TravelTimeSeconds         585374 non-null  float64       
 11  AttendanceTimeSeconds     587919 non-null  int64         
 12  DateAndTimeLeft           587696 non-null  datetime64[ns]
 13  DateAndTimeReturned       0 non-null       float64       
 14  DeployedFromStation_Code  587915 non-null  object        
 15  DeployedFromStation_Name  587915 non-null  object        
 16  DeployedFromLocation      587476 non-null  object        
 17  PumpOrder                 587919 non-null  int64         
 18  PlusCode_Code             587919 non-null  object        
 19  PlusCode_Description      587919 non-null  object        
 20  DelayCodeId               145365 non-null  float64       
 21  DelayCode_Description     145365 non-null  object        
dtypes: datetime64[ns](4), float64(4), int64(5), object(9)
memory usage: 98.7+ MB
In [12]:
# Summary statistics for the numeric and datetime columns.
mobi_2021.describe()
Out[12]:
CalYear HourOfCall ResourceMobilisationId DateAndTimeMobilised DateAndTimeMobile DateAndTimeArrived TurnoutTimeSeconds TravelTimeSeconds AttendanceTimeSeconds DateAndTimeLeft DateAndTimeReturned PumpOrder DelayCodeId
count 587919.000000 587919.000000 5.879190e+05 587919 585390 587919 585372.000000 585374.000000 587919.000000 587696 0.0 587919.000000 145365.000000
mean 2022.258306 13.335888 6.203947e+06 2022-09-28 00:51:11.930602496 2022-09-27 23:32:29.076929536 2022-09-28 00:57:07.751988224 73.693889 283.196083 355.821385 2022-09-28 01:44:28.466557440 NaN 1.538125 10.654160
min 2021.000000 0.000000 5.769249e+06 2021-01-01 00:06:47 2021-01-01 00:07:35 2021-01-01 00:09:48 0.000000 0.000000 0.000000 2021-01-01 00:18:39 NaN 1.000000 3.000000
25% 2021.000000 9.000000 5.983400e+06 2021-12-04 20:55:00 2021-12-04 21:18:41.500000 2021-12-04 21:00:27.500000 51.000000 183.000000 256.000000 2021-12-04 21:38:54.500000 NaN 1.000000 9.000000
50% 2022.000000 14.000000 6.204396e+06 2022-10-06 19:11:00 2022-10-06 17:52:46 2022-10-06 19:15:42 70.000000 259.000000 333.000000 2022-10-06 20:10:27.500000 NaN 1.000000 12.000000
75% 2023.000000 19.000000 6.425224e+06 2023-07-28 20:08:34.500000 2023-07-28 19:06:01.750000128 2023-07-28 20:12:34 90.000000 355.000000 430.000000 2023-07-28 20:41:40 NaN 2.000000 12.000000
max 2024.000000 23.000000 6.645201e+06 2024-04-29 23:59:36 2024-04-30 00:02:20 2024-04-30 00:04:26 1192.000000 1180.000000 1200.000000 2024-04-30 01:33:08 NaN 13.000000 13.000000
std 0.971986 6.407685 2.542291e+05 NaN NaN NaN 40.431050 149.259150 151.999921 NaN NaN 0.920319 2.015382
In [13]:
# Percentage of missing values per column; the boolean null mask has no NaNs,
# so its column-wise mean equals sum / row count.
mobi_2021.isnull().mean() * 100
Out[13]:
IncidentNumber                0.000000
CalYear                       0.000000
HourOfCall                    0.000000
ResourceMobilisationId        0.000000
Resource_Code                 0.000000
PerformanceReporting          0.000000
DateAndTimeMobilised          0.000000
DateAndTimeMobile             0.430161
DateAndTimeArrived            0.000000
TurnoutTimeSeconds            0.433223
TravelTimeSeconds             0.432883
AttendanceTimeSeconds         0.000000
DateAndTimeLeft               0.037930
DateAndTimeReturned         100.000000
DeployedFromStation_Code      0.000680
DeployedFromStation_Name      0.000680
DeployedFromLocation          0.075351
PumpOrder                     0.000000
PlusCode_Code                 0.000000
PlusCode_Description          0.000000
DelayCodeId                  75.274655
DelayCode_Description        75.274655
dtype: float64
In [14]:
# Count exact duplicate rows (output shows 0).
mobi_2021.duplicated().sum()
Out[14]:
0
In [6]:
# DateAndTimeReturned is 100% null (see the missing-value summary above), so
# it carries no information — drop it outright.
mobi_2021 = mobi_2021.drop(columns=["DateAndTimeReturned"])
In [16]:
# Confirm DateAndTimeReturned is gone from the column list.
print(mobi_2021.columns)
Index(['IncidentNumber', 'CalYear', 'HourOfCall', 'ResourceMobilisationId',
       'Resource_Code', 'PerformanceReporting', 'DateAndTimeMobilised',
       'DateAndTimeMobile', 'DateAndTimeArrived', 'TurnoutTimeSeconds',
       'TravelTimeSeconds', 'AttendanceTimeSeconds', 'DateAndTimeLeft',
       'DeployedFromStation_Code', 'DeployedFromStation_Name',
       'DeployedFromLocation', 'PumpOrder', 'PlusCode_Code',
       'PlusCode_Description', 'DelayCodeId', 'DelayCode_Description'],
      dtype='object')

Incident 2018¶

In [17]:
# Preview the first rows of the incident data.
incident_2018_onwards.head()
Out[17]:
IncidentNumber DateOfCall CalYear TimeOfCall HourOfCall IncidentGroup StopCodeDescription SpecialServiceType PropertyCategory PropertyType ... FirstPumpArriving_AttendanceTime FirstPumpArriving_DeployedFromStation SecondPumpArriving_AttendanceTime SecondPumpArriving_DeployedFromStation NumStationsWithPumpsAttending NumPumpsAttending PumpCount PumpMinutesRounded Notional Cost (£) NumCalls
0 000008-01012018 2018-01-01 2018 00:04:25 0 False Alarm AFA NaN Non Residential Mosque ... 348.0 Finchley NaN NaN 1.0 1.0 1 60 328 1.0
1 000009-01012018 2018-01-01 2018 00:04:30 0 False Alarm AFA NaN Non Residential Pub/wine bar/bar ... 144.0 Beckenham NaN NaN 1.0 1.0 1 60 328 1.0
2 000010-01012018 2018-01-01 2018 00:04:34 0 Fire Secondary Fire NaN Outdoor Structure Common external bin storage area ... 232.0 Southgate NaN NaN 1.0 1.0 1 60 328 1.0
3 000011-01012018 2018-01-01 2018 00:04:58 0 Special Service Special Service RTC Road Vehicle Multiple Vehicles ... 22.0 Enfield NaN NaN 1.0 1.0 1 60 328 1.0
4 000014-01012018 2018-01-01 2018 00:07:47 0 Fire Primary Fire NaN Road Vehicle Car ... 241.0 Stratford NaN NaN 1.0 1.0 1 60 328 6.0

5 rows × 39 columns

In [18]:
# Column dtypes and non-null counts (713,368 rows x 39 columns per output).
incident_2018_onwards.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 713368 entries, 0 to 713367
Data columns (total 39 columns):
 #   Column                                  Non-Null Count   Dtype         
---  ------                                  --------------   -----         
 0   IncidentNumber                          713368 non-null  object        
 1   DateOfCall                              713368 non-null  datetime64[ns]
 2   CalYear                                 713368 non-null  int64         
 3   TimeOfCall                              713368 non-null  object        
 4   HourOfCall                              713368 non-null  int64         
 5   IncidentGroup                           713368 non-null  object        
 6   StopCodeDescription                     713368 non-null  object        
 7   SpecialServiceType                      249623 non-null  object        
 8   PropertyCategory                        713368 non-null  object        
 9   PropertyType                            713368 non-null  object        
 10  AddressQualifier                        713368 non-null  object        
 11  Postcode_full                           321180 non-null  object        
 12  Postcode_district                       713368 non-null  object        
 13  UPRN                                    713368 non-null  int64         
 14  USRN                                    713368 non-null  int64         
 15  IncGeo_BoroughCode                      713368 non-null  object        
 16  IncGeo_BoroughName                      713368 non-null  object        
 17  ProperCase                              713368 non-null  object        
 18  IncGeo_WardCode                         713034 non-null  object        
 19  IncGeo_WardName                         713034 non-null  object        
 20  IncGeo_WardNameNew                      713034 non-null  object        
 21  Easting_m                               321180 non-null  float64       
 22  Northing_m                              321180 non-null  float64       
 23  Easting_rounded                         713368 non-null  int64         
 24  Northing_rounded                        713368 non-null  int64         
 25  Latitude                                321180 non-null  float64       
 26  Longitude                               321180 non-null  float64       
 27  FRS                                     713368 non-null  object        
 28  IncidentStationGround                   713367 non-null  object        
 29  FirstPumpArriving_AttendanceTime        672670 non-null  float64       
 30  FirstPumpArriving_DeployedFromStation   672664 non-null  object        
 31  SecondPumpArriving_AttendanceTime       262206 non-null  float64       
 32  SecondPumpArriving_DeployedFromStation  262201 non-null  object        
 33  NumStationsWithPumpsAttending           705707 non-null  float64       
 34  NumPumpsAttending                       705707 non-null  float64       
 35  PumpCount                               713368 non-null  int64         
 36  PumpMinutesRounded                      713368 non-null  int64         
 37  Notional Cost (£)                       713368 non-null  int64         
 38  NumCalls                                713350 non-null  float64       
dtypes: datetime64[ns](1), float64(9), int64(9), object(20)
memory usage: 212.3+ MB
In [19]:
# Summary statistics. NOTE(review): Latitude min is 0.000000 while real London
# latitudes are ~51.3-51.7 — some rows carry a (0, 0)-style placeholder rather
# than a true location; this matters for the distance feature built later.
incident_2018_onwards.describe()
Out[19]:
DateOfCall CalYear HourOfCall UPRN USRN Easting_m Northing_m Easting_rounded Northing_rounded Latitude Longitude FirstPumpArriving_AttendanceTime SecondPumpArriving_AttendanceTime NumStationsWithPumpsAttending NumPumpsAttending PumpCount PumpMinutesRounded Notional Cost (£) NumCalls
count 713368 713368.000000 713368.000000 7.133680e+05 7.133680e+05 321180.000000 321180.000000 713368.000000 713368.000000 321180.000000 321180.000000 672670.000000 262206.000000 705707.000000 705707.000000 713368.000000 713368.000000 713368.000000 713350.000000
mean 2021-04-28 07:14:44.063204352 2020.830313 13.384972 2.009105e+10 2.040403e+07 530632.285466 180357.972424 530629.489969 180461.024324 51.356573 -0.118512 313.183936 393.080071 1.396969 1.550634 1.597286 76.667644 453.700487 1.315119
min 2018-01-01 00:00:00 2018.000000 0.000000 0.000000e+00 4.200558e+06 503582.000000 155901.000000 503550.000000 155950.000000 0.000000 -0.510155 1.000000 1.000000 1.000000 1.000000 1.000000 60.000000 328.000000 1.000000
25% 2019-09-12 00:00:00 2019.000000 9.000000 0.000000e+00 2.040099e+07 524995.000000 175869.000000 525150.000000 176050.000000 51.467390 -0.199705 230.000000 299.000000 1.000000 1.000000 1.000000 60.000000 339.000000 1.000000
50% 2021-06-23 00:00:00 2021.000000 14.000000 0.000000e+00 2.120112e+07 530816.000000 180976.000000 530950.000000 181050.000000 51.512776 -0.116616 295.000000 370.000000 1.000000 1.000000 1.000000 60.000000 352.000000 1.000000
75% 2022-12-08 00:00:00 2022.000000 19.000000 1.001284e+10 2.210019e+07 536962.000000 185025.000000 536350.000000 185250.000000 51.548591 -0.026067 373.000000 460.000000 2.000000 2.000000 2.000000 60.000000 388.000000 1.000000
max 2024-04-30 00:00:00 2024.000000 23.000000 2.000044e+11 9.999042e+07 561126.000000 200885.000000 611150.000000 302450.000000 51.691458 0.322219 1200.000000 1200.000000 14.000000 14.000000 250.000000 72157.000000 407687.000000 175.000000
std NaN 1.858690 6.294854 4.536757e+10 4.606893e+06 10402.854680 7494.943347 9727.060482 7429.181942 2.780262 0.149779 132.815172 147.129355 0.769330 0.905426 1.479428 310.087012 1802.093519 1.519300

Dataframe¶

In [7]:
# Join each mobilisation row to its incident record; the inner join keeps only
# mobilisations whose incident appears in the 2018-onwards incident extract.
merge_keys = ["IncidentNumber", "CalYear", "HourOfCall"]
df = mobi_2021.merge(incident_2018_onwards, on=merge_keys, how="inner")
In [21]:
# Preview the merged frame (57 columns).
df.head()
Out[21]:
IncidentNumber CalYear HourOfCall ResourceMobilisationId Resource_Code PerformanceReporting DateAndTimeMobilised DateAndTimeMobile DateAndTimeArrived TurnoutTimeSeconds ... FirstPumpArriving_AttendanceTime FirstPumpArriving_DeployedFromStation SecondPumpArriving_AttendanceTime SecondPumpArriving_DeployedFromStation NumStationsWithPumpsAttending NumPumpsAttending PumpCount PumpMinutesRounded Notional Cost (£) NumCalls
0 000004-01012021 2021 0 5769249 A321 1 2021-01-01 00:06:47 2021-01-01 00:07:35 2021-01-01 00:09:48 48.0 ... 181.0 Hornsey NaN NaN 1.0 1.0 1 60 346 1.0
1 000005-01012021 2021 0 5769250 F351 1 2021-01-01 00:07:47 2021-01-01 00:09:41 2021-01-01 00:11:57 114.0 ... 250.0 Woodford NaN NaN 1.0 1.0 1 60 346 1.0
2 000006-01012021 2021 0 5769251 F412 1 2021-01-01 00:08:21 2021-01-01 00:10:32 2021-01-01 00:14:37 131.0 ... 376.0 Dagenham NaN NaN 1.0 1.0 1 60 346 1.0
3 000007-01012021 2021 0 5769252 H331 1 2021-01-01 00:12:23 2021-01-01 00:13:16 2021-01-01 00:19:12 53.0 ... 409.0 Wandsworth 445.0 Fulham 5.0 5.0 5 99 571 4.0
4 000007-01012021 2021 0 5769253 G351 2 2021-01-01 00:12:23 2021-01-01 00:13:32 2021-01-01 00:19:48 69.0 ... 409.0 Wandsworth 445.0 Fulham 5.0 5.0 5 99 571 4.0

5 rows × 57 columns

In [22]:
# Structure of the merged frame (585,577 rows x 57 columns per output).
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 585577 entries, 0 to 585576
Data columns (total 57 columns):
 #   Column                                  Non-Null Count   Dtype         
---  ------                                  --------------   -----         
 0   IncidentNumber                          585577 non-null  object        
 1   CalYear                                 585577 non-null  int64         
 2   HourOfCall                              585577 non-null  int64         
 3   ResourceMobilisationId                  585577 non-null  int64         
 4   Resource_Code                           585577 non-null  object        
 5   PerformanceReporting                    585577 non-null  object        
 6   DateAndTimeMobilised                    585577 non-null  datetime64[ns]
 7   DateAndTimeMobile                       583088 non-null  datetime64[ns]
 8   DateAndTimeArrived                      585577 non-null  datetime64[ns]
 9   TurnoutTimeSeconds                      583070 non-null  float64       
 10  TravelTimeSeconds                       583072 non-null  float64       
 11  AttendanceTimeSeconds                   585577 non-null  int64         
 12  DateAndTimeLeft                         585356 non-null  datetime64[ns]
 13  DeployedFromStation_Code                585574 non-null  object        
 14  DeployedFromStation_Name                585574 non-null  object        
 15  DeployedFromLocation                    585137 non-null  object        
 16  PumpOrder                               585577 non-null  int64         
 17  PlusCode_Code                           585577 non-null  object        
 18  PlusCode_Description                    585577 non-null  object        
 19  DelayCodeId                             143471 non-null  float64       
 20  DelayCode_Description                   143471 non-null  object        
 21  DateOfCall                              585577 non-null  datetime64[ns]
 22  TimeOfCall                              585577 non-null  object        
 23  IncidentGroup                           585577 non-null  object        
 24  StopCodeDescription                     585577 non-null  object        
 25  SpecialServiceType                      144498 non-null  object        
 26  PropertyCategory                        585577 non-null  object        
 27  PropertyType                            585577 non-null  object        
 28  AddressQualifier                        585577 non-null  object        
 29  Postcode_full                           217851 non-null  object        
 30  Postcode_district                       585577 non-null  object        
 31  UPRN                                    585577 non-null  int64         
 32  USRN                                    585577 non-null  int64         
 33  IncGeo_BoroughCode                      585577 non-null  object        
 34  IncGeo_BoroughName                      585577 non-null  object        
 35  ProperCase                              585577 non-null  object        
 36  IncGeo_WardCode                         585130 non-null  object        
 37  IncGeo_WardName                         585130 non-null  object        
 38  IncGeo_WardNameNew                      585130 non-null  object        
 39  Easting_m                               217851 non-null  float64       
 40  Northing_m                              217851 non-null  float64       
 41  Easting_rounded                         585577 non-null  int64         
 42  Northing_rounded                        585577 non-null  int64         
 43  Latitude                                217851 non-null  float64       
 44  Longitude                               217851 non-null  float64       
 45  FRS                                     585577 non-null  object        
 46  IncidentStationGround                   585575 non-null  object        
 47  FirstPumpArriving_AttendanceTime        585576 non-null  float64       
 48  FirstPumpArriving_DeployedFromStation   585571 non-null  object        
 49  SecondPumpArriving_AttendanceTime       348025 non-null  float64       
 50  SecondPumpArriving_DeployedFromStation  348021 non-null  object        
 51  NumStationsWithPumpsAttending           585577 non-null  float64       
 52  NumPumpsAttending                       585577 non-null  float64       
 53  PumpCount                               585577 non-null  int64         
 54  PumpMinutesRounded                      585577 non-null  int64         
 55  Notional Cost (£)                       585577 non-null  int64         
 56  NumCalls                                585571 non-null  float64       
dtypes: datetime64[ns](5), float64(12), int64(12), object(28)
memory usage: 254.7+ MB
In [23]:
# Percentage of missing values per column of the merged frame; the boolean
# null mask has no NaNs, so its column-wise mean equals sum / row count.
df.isnull().mean() * 100
Out[23]:
IncidentNumber                             0.000000
CalYear                                    0.000000
HourOfCall                                 0.000000
ResourceMobilisationId                     0.000000
Resource_Code                              0.000000
PerformanceReporting                       0.000000
DateAndTimeMobilised                       0.000000
DateAndTimeMobile                          0.425051
DateAndTimeArrived                         0.000000
TurnoutTimeSeconds                         0.428125
TravelTimeSeconds                          0.427783
AttendanceTimeSeconds                      0.000000
DateAndTimeLeft                            0.037741
DeployedFromStation_Code                   0.000512
DeployedFromStation_Name                   0.000512
DeployedFromLocation                       0.075140
PumpOrder                                  0.000000
PlusCode_Code                              0.000000
PlusCode_Description                       0.000000
DelayCodeId                               75.499208
DelayCode_Description                     75.499208
DateOfCall                                 0.000000
TimeOfCall                                 0.000000
IncidentGroup                              0.000000
StopCodeDescription                        0.000000
SpecialServiceType                        75.323826
PropertyCategory                           0.000000
PropertyType                               0.000000
AddressQualifier                           0.000000
Postcode_full                             62.797207
Postcode_district                          0.000000
UPRN                                       0.000000
USRN                                       0.000000
IncGeo_BoroughCode                         0.000000
IncGeo_BoroughName                         0.000000
ProperCase                                 0.000000
IncGeo_WardCode                            0.076335
IncGeo_WardName                            0.076335
IncGeo_WardNameNew                         0.076335
Easting_m                                 62.797207
Northing_m                                62.797207
Easting_rounded                            0.000000
Northing_rounded                           0.000000
Latitude                                  62.797207
Longitude                                 62.797207
FRS                                        0.000000
IncidentStationGround                      0.000342
FirstPumpArriving_AttendanceTime           0.000171
FirstPumpArriving_DeployedFromStation      0.001025
SecondPumpArriving_AttendanceTime         40.567167
SecondPumpArriving_DeployedFromStation    40.567850
NumStationsWithPumpsAttending              0.000000
NumPumpsAttending                          0.000000
PumpCount                                  0.000000
PumpMinutesRounded                         0.000000
Notional Cost (£)                          0.000000
NumCalls                                   0.001025
dtype: float64
In [24]:
df["DeployedFromStation_Name"].unique()
Out[24]:
array(['Hornsey', 'Woodford', 'Dagenham', 'Wandsworth', 'Fulham',
       'Wimbledon', 'Battersea', 'Tooting', 'Stratford', 'Acton',
       'Chiswick', 'Ealing', 'Hammersmith', 'Park Royal', 'Brixton',
       'Mitcham', 'Soho', 'Paddington', 'Euston', 'Islington', 'Lambeth',
       'Holloway', 'Barnet', 'Shoreditch', 'Whitechapel', 'Kentish Town',
       'Heston', 'Northolt', 'Harrow', 'Kensington', 'Chelsea',
       'North Kensington', 'East Ham', 'Clapham', 'Dowgate',
       'Old Kent Road', 'Dockhead', 'Peckham', 'Willesden', 'Feltham',
       'Southall', 'Stanmore', 'Hendon', 'Mill Hill', 'Finchley',
       'Forest Hill', 'Twickenham', 'Tottenham', 'Wallington', 'Hayes',
       'Hornchurch', 'Kingston', 'Leyton', 'Southgate', 'Surbiton',
       'Plumstead', 'Heathrow', 'Harold Hill', 'Romford', 'Ilford',
       'Woodside', 'Edmonton', 'Sutton', 'West Hampstead', 'New Malden',
       'Croydon', 'Shadwell', 'Enfield', 'Homerton', 'Leytonstone',
       'New Cross', 'Richmond', 'Walthamstow', 'Purley', 'Norbury',
       'West Norwood', 'Lewisham', 'Plaistow', 'Erith', 'Beckenham',
       'Greenwich', 'Deptford', 'Stoke Newington', 'Bethnal Green',
       'Orpington', 'Chingford', 'Poplar', 'Barking', 'Wembley',
       'Ruislip', 'East Greenwich', 'Hillingdon', 'Hainault', 'Eltham',
       'Lee Green', 'Bromley', 'Bexley', 'Addington', 'Millwall',
       'Sidcup', 'Biggin Hill', 'Wennington', nan, 'Hertfordshire',
       'Fordbridge', 'Buckinghamshire'], dtype=object)
In [25]:
# Preview the station GPS reference table (coordinates + zone labels).
gps_station.head()
Out[25]:
Fire Station Latitude Longitude Geo_zone Categ_geo
0 Hornsey 51.586460 -0.127290 Nord Banlieue
1 Woodford 51.607600 0.033600 Nord-Est Banlieue
2 Dagenham 51.559266 0.156838 Est Banlieue
3 Wandsworth 51.458300 -0.192100 Sud-Ouest Urbain
4 Fulham 51.474400 -0.204100 Ouest Urbain
In [26]:
# 94 stations, no missing values in the reference table.
gps_station.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94 entries, 0 to 93
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Fire Station  94 non-null     object 
 1   Latitude      94 non-null     float64
 2   Longitude     94 non-null     float64
 3   Geo_zone      94 non-null     object 
 4   Categ_geo     94 non-null     object 
dtypes: float64(2), object(3)
memory usage: 3.8+ KB

FEATURE ENGINEERING¶

In [8]:
# Align the station reference table's name column with the mobilisation
# frame's key so the two can be merged on DeployedFromStation_Name.
gps_station = gps_station.rename(columns={"Fire Station": "DeployedFromStation_Name"})
In [9]:
# Attach station coordinates and zone labels to each mobilisation row; the
# left join preserves rows whose station is absent from the GPS table (their
# new columns come back NaN).
df = df.merge(gps_station, on="DeployedFromStation_Name", how="left")
In [29]:
# Preview after the GPS merge: colliding Latitude/Longitude columns got the
# _x (incident) and _y (station) suffixes.
df.head()
Out[29]:
IncidentNumber CalYear HourOfCall ResourceMobilisationId Resource_Code PerformanceReporting DateAndTimeMobilised DateAndTimeMobile DateAndTimeArrived TurnoutTimeSeconds ... NumStationsWithPumpsAttending NumPumpsAttending PumpCount PumpMinutesRounded Notional Cost (£) NumCalls Latitude_y Longitude_y Geo_zone Categ_geo
0 000004-01012021 2021 0 5769249 A321 1 2021-01-01 00:06:47 2021-01-01 00:07:35 2021-01-01 00:09:48 48.0 ... 1.0 1.0 1 60 346 1.0 51.586460 -0.127290 Nord Banlieue
1 000005-01012021 2021 0 5769250 F351 1 2021-01-01 00:07:47 2021-01-01 00:09:41 2021-01-01 00:11:57 114.0 ... 1.0 1.0 1 60 346 1.0 51.607600 0.033600 Nord-Est Banlieue
2 000006-01012021 2021 0 5769251 F412 1 2021-01-01 00:08:21 2021-01-01 00:10:32 2021-01-01 00:14:37 131.0 ... 1.0 1.0 1 60 346 1.0 51.559266 0.156838 Est Banlieue
3 000007-01012021 2021 0 5769252 H331 1 2021-01-01 00:12:23 2021-01-01 00:13:16 2021-01-01 00:19:12 53.0 ... 5.0 5.0 5 99 571 4.0 51.458300 -0.192100 Sud-Ouest Urbain
4 000007-01012021 2021 0 5769253 G351 2 2021-01-01 00:12:23 2021-01-01 00:13:32 2021-01-01 00:19:48 69.0 ... 5.0 5.0 5 99 571 4.0 51.474400 -0.204100 Ouest Urbain

5 rows × 61 columns

In [10]:
# Disambiguate the merge suffixes: _x columns came from the incident data,
# _y columns from the station GPS table.
df = df.rename(columns={
    "Latitude_x": "Latitude_incident",
    "Longitude_x": "Longitude_incident",
    "Latitude_y": "Latitude_station_deploy",
    "Longitude_y": "Longitude_station_deploy",
})

Gestion des valeurs manquantes sur les variables géographiques¶

In [11]:
# Keep only rows with a known incident longitude (exact lat/lon exists for
# only ~37% of rows — see the missing-value summary above).
# NOTE(review): placeholder coordinates such as Latitude == 0.0 are non-null
# and therefore survive this filter (the incident describe() output shows a
# Latitude min of 0.0).
df = df[df["Longitude_incident"].notna()]
In [12]:
# Sanity check: the filter above removed every null incident longitude.
df['Longitude_incident'].isnull().sum()
Out[12]:
0

Variables géographiques¶

In [13]:
from geopy.distance import geodesic

def calculate_distance(row):
    """Geodesic distance in kilometres between the deploying station and the
    incident location for one mobilisation row.

    Returns np.nan when either coordinate pair is missing, or when a pair is
    the (0, 0) placeholder. The incident extract contains rows whose Latitude
    is exactly 0 (describe() shows Latitude min == 0.0); those values are
    non-null, so they survive the earlier dropna, and measuring a geodesic
    from London to (0, ~0) yields thousands of kilometres — which is what
    inflated the mean distance printed further down.
    """
    station_coords = (row["Latitude_station_deploy"], row["Longitude_station_deploy"])
    incident_coords = (row["Latitude_incident"], row["Longitude_incident"])

    # Missing coordinates on either side -> no distance.
    if np.isnan(station_coords).any() or np.isnan(incident_coords).any():
        return np.nan
    # (0, 0) "null island" placeholder on either side -> treat as missing.
    if incident_coords == (0.0, 0.0) or station_coords == (0.0, 0.0):
        return np.nan
    return geodesic(station_coords, incident_coords).kilometers

# Feature: geodesic distance (km) from deploying station to incident.
# NOTE(review): row-wise apply over ~218k rows is slow but unavoidable with
# geopy's scalar geodesic; and placeholder (0, 0) incident coordinates — see
# the Latitude min of 0.0 in the incident describe() — will produce huge
# distances here unless filtered.
df["DistanceToIncident_km"] = df.apply(calculate_distance, axis=1)

# Feature: mean first-pump attendance time per deploying station, mapped back
# onto every row of that station.
# NOTE(review): this aggregate is computed on the full dataset before any
# train/test split, so it leaks target-derived information into a later
# model — confirm this is intended, or recompute on the training fold only.
station_avg_response_time = df.groupby("FirstPumpArriving_DeployedFromStation")["FirstPumpArriving_AttendanceTime"].mean().to_dict()
df["AvgResponseTimeByStation"] = df["FirstPumpArriving_DeployedFromStation"].map(station_avg_response_time)

# Feature: incident count per geographic zone, as a density proxy.
incident_zone_density = df.groupby("Geo_zone")["IncidentNumber"].count().to_dict()
df["IncidentDensityByZone"] = df["Geo_zone"].map(incident_zone_density)


df.head()
Out[13]:
IncidentNumber CalYear HourOfCall ResourceMobilisationId Resource_Code PerformanceReporting DateAndTimeMobilised DateAndTimeMobile DateAndTimeArrived TurnoutTimeSeconds ... PumpMinutesRounded Notional Cost (£) NumCalls Latitude_station_deploy Longitude_station_deploy Geo_zone Categ_geo DistanceToIncident_km AvgResponseTimeByStation IncidentDensityByZone
0 000004-01012021 2021 0 5769249 A321 1 2021-01-01 00:06:47 2021-01-01 00:07:35 2021-01-01 00:09:48 48.0 ... 60 346 1.0 51.586460 -0.127290 Nord Banlieue 1.178989 305.351016 25304.0
1 000005-01012021 2021 0 5769250 F351 1 2021-01-01 00:07:47 2021-01-01 00:09:41 2021-01-01 00:11:57 114.0 ... 60 346 1.0 51.607600 0.033600 Nord-Est Banlieue 0.228468 367.854604 4567.0
2 000006-01012021 2021 0 5769251 F412 1 2021-01-01 00:08:21 2021-01-01 00:10:32 2021-01-01 00:14:37 131.0 ... 60 346 1.0 51.559266 0.156838 Est Banlieue 2.133463 362.570811 40494.0
8 000009-01012021 2021 0 5769259 F211 1 2021-01-01 00:14:51 2021-01-01 00:16:37 2021-01-01 00:20:53 106.0 ... 60 346 2.0 51.541700 -0.002800 Est Urbain 1.931720 299.386401 40494.0
15 000015-01012021 2021 0 5769268 H361 1 2021-01-01 00:16:27 2021-01-01 00:17:15 2021-01-01 00:20:32 48.0 ... 60 346 1.0 51.400500 -0.168000 Sud Banlieue 1.003831 318.683989 26344.0

5 rows × 64 columns

In [34]:
print(df["DistanceToIncident_km"].mean())
print(df["AttendanceTimeSeconds"].mean())
27.16544992287631
345.3496518262482

Temporalité des incidents¶

In [14]:
df["Month"] = df["DateAndTimeMobilised"].dt.month_name()
df["DayOfWeek"] = df["DateAndTimeMobilised"].dt.day_name()
In [36]:
# Preview with the new temporal columns appended.
df.head()
Out[36]:
IncidentNumber CalYear HourOfCall ResourceMobilisationId Resource_Code PerformanceReporting DateAndTimeMobilised DateAndTimeMobile DateAndTimeArrived TurnoutTimeSeconds ... NumCalls Latitude_station_deploy Longitude_station_deploy Geo_zone Categ_geo DistanceToIncident_km AvgResponseTimeByStation IncidentDensityByZone Month DayOfWeek
0 000004-01012021 2021 0 5769249 A321 1 2021-01-01 00:06:47 2021-01-01 00:07:35 2021-01-01 00:09:48 48.0 ... 1.0 51.586460 -0.127290 Nord Banlieue 1.178989 305.351016 25304.0 January Friday
1 000005-01012021 2021 0 5769250 F351 1 2021-01-01 00:07:47 2021-01-01 00:09:41 2021-01-01 00:11:57 114.0 ... 1.0 51.607600 0.033600 Nord-Est Banlieue 0.228468 367.854604 4567.0 January Friday
2 000006-01012021 2021 0 5769251 F412 1 2021-01-01 00:08:21 2021-01-01 00:10:32 2021-01-01 00:14:37 131.0 ... 1.0 51.559266 0.156838 Est Banlieue 2.133463 362.570811 40494.0 January Friday
8 000009-01012021 2021 0 5769259 F211 1 2021-01-01 00:14:51 2021-01-01 00:16:37 2021-01-01 00:20:53 106.0 ... 2.0 51.541700 -0.002800 Est Urbain 1.931720 299.386401 40494.0 January Friday
15 000015-01012021 2021 0 5769268 H361 1 2021-01-01 00:16:27 2021-01-01 00:17:15 2021-01-01 00:20:32 48.0 ... 1.0 51.400500 -0.168000 Sud Banlieue 1.003831 318.683989 26344.0 January Friday

5 rows × 66 columns

In [15]:
def segment_time_of_day(hour):
    """Map an hour of the day (0-23) to a coarse time-of-day label.

    Buckets are 4-hour wide: Night (0-3), Early Morning (4-7), Morning (8-11),
    Afternoon (12-15), Evening (16-19), Late Evening (20-23). Out-of-range
    hours fall through to "Late Evening", matching the original else branch.
    """
    labels = ["Night", "Early Morning", "Morning",
              "Afternoon", "Evening", "Late Evening"]
    if 0 <= hour < 24:
        # Integer-divide into one of the six 4-hour buckets.
        return labels[int(hour // 4)]
    return "Late Evening"

# Label each record with its time-of-day bucket from the call hour.
df["TimeOfDay"] = df["HourOfCall"].map(segment_time_of_day)
In [16]:
# Weekend flag (1 = Saturday/Sunday) and meteorological season from the month name.
weekend_days = {"Saturday", "Sunday"}
df["IsWeekend"] = df["DayOfWeek"].isin(weekend_days).astype(int)

season_by_month = {
    "December": "Winter", "January": "Winter", "February": "Winter",
    "March": "Spring", "April": "Spring", "May": "Spring",
    "June": "Summer", "July": "Summer", "August": "Summer",
    "September": "Fall", "October": "Fall", "November": "Fall",
}
df["Season"] = df["Month"].map(season_by_month)
In [17]:
# Cyclical (sin/cos) encoding so hour 23 sits next to hour 0 on the unit circle.
hour_angle = 2 * np.pi * df["HourOfCall"] / 24
df["HourOfCall_sin"] = np.sin(hour_angle)
df["HourOfCall_cos"] = np.cos(hour_angle)

# Map weekday names to 1..7, then encode the week cyclically as well.
weekday_mapping = {name: rank for rank, name in enumerate(
    ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"],
    start=1)}
df["DayOfWeek"] = df["DayOfWeek"].map(weekday_mapping)

day_angle = 2 * np.pi * df["DayOfWeek"] / 7
df["DayOfWeek_sin"] = np.sin(day_angle)
df["DayOfWeek_cos"] = np.cos(day_angle)
In [18]:
# Map month names to 1..12, then encode the year cycle with sin/cos
# so December sits next to January.
month_mapping = {name: rank for rank, name in enumerate(
    ["January", "February", "March", "April", "May", "June",
     "July", "August", "September", "October", "November", "December"],
    start=1)}
df["Month"] = df["Month"].map(month_mapping)

month_angle = 2 * np.pi * df["Month"] / 12
df["Month_sin"] = np.sin(month_angle)
df["Month_cos"] = np.cos(month_angle)

Criticité incident et retards¶

In [29]:
# Severity proxy: number of pumps attending plus this pump's order in the
# mobilisation, then rescaled so the maximum observed value maps to 20.
df["IncidentSeverity"] = df["NumPumpsAttending"] + df["PumpOrder"]
df["IncidentSeverity"] = df["IncidentSeverity"] / df["IncidentSeverity"].max() * 20
In [30]:
# Distribution of the engineered IncidentSeverity score.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df["IncidentSeverity"], bins=20, kde=True, color='blue', ax=ax)
ax.set_title("Distribution de Incident Severity")
ax.set_xlabel("Incident Severity")
ax.set_ylabel("Nombre d'incidents")
plt.show()
No description has been provided for this image
In [19]:
# A non-null delay description means the mobilisation was delayed.
df["HasDelay"] = df["DelayCode_Description"].notna().astype(int)
In [20]:
# Mean attendance time per deploying station, broadcast back to every row.
df["AvgResponseTimeByStation"] = (
    df.groupby("DeployedFromStation_Name")["AttendanceTimeSeconds"].transform("mean")
)

# Incident count per geographic zone, broadcast to each row.
# NOTE(review): this creates "IncidentsDensityByZone" alongside the pre-existing
# "IncidentDensityByZone" column — apparently the same quantity under two
# spellings; confirm and deduplicate upstream.
df["IncidentsDensityByZone"] = df.groupby("Geo_zone")["IncidentNumber"].transform("count")
In [21]:
# Weight each time-of-day bucket by its mean attendance time relative to the
# global mean (>1 means slower-than-average periods).
bucket_means = df.groupby("TimeOfDay")["AttendanceTimeSeconds"].mean()
overall_mean = df["AttendanceTimeSeconds"].mean()
df["TimeOfDayWeight"] = df["TimeOfDay"].map(bucket_means / overall_mean)

# Distance to the incident, scaled by the time-of-day effect.
df["DistanceTimeInteraction"] = df["DistanceToIncident_km"] * df["TimeOfDayWeight"]

# Interaction with the average pump deployment of the incident's zone.
zone_mean_pumps = df.groupby("Geo_zone")["NumPumpsAttending"].mean()
df["GeoZonePumpsInteraction"] = df["NumPumpsAttending"] * df["Geo_zone"].map(zone_mean_pumps)
In [46]:
# Distribution of the GeoZonePumpsInteraction feature.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(df["GeoZonePumpsInteraction"], bins=20, kde=True, ax=ax, color="green")
ax.set(
    title="Distribution de GeoZonePumpsInteraction",
    xlabel="GeoZonePumpsInteraction",
    ylabel="Nombre d'incidents",
)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [22]:
# Timing features comparing the first and second pumps on scene.
first_time = df["FirstPumpArriving_AttendanceTime"]
second_time = df["SecondPumpArriving_AttendanceTime"]

# Gap between the second and first pump arrivals.
df["TimeDifference_FirstSecondPump"] = second_time - first_time

# Ratio second/first; a zero first time yields +/-inf, which we null out.
ratio = second_time / first_time
df["Ratio_SecondToFirstPump"] = ratio.replace([np.inf, -np.inf], np.nan)

# Average of the two arrival times.
df["MeanPumpArrivalTime"] = (first_time + second_time) / 2

# Binary flag: did a second pump actually arrive?
df["SecondPumpArrived"] = second_time.notna().astype(int)
In [48]:
# Side-by-side distributions of the first/second pump timing features.
fig, (ax_diff, ax_mean) = plt.subplots(1, 2, figsize=(15, 6))

# Gap between first and second pump.
sns.histplot(df["TimeDifference_FirstSecondPump"].dropna(), bins=20, kde=True,
             ax=ax_diff, color='blue')
ax_diff.set_title("Différence entre le temps du 1er et du 2e camion")
ax_diff.set_xlabel("Temps de différence en secondes")
ax_diff.set_ylabel("Nombre d'incidents")

# Mean arrival time of both pumps.
sns.histplot(df["MeanPumpArrivalTime"].dropna(), bins=20, kde=True,
             ax=ax_mean, color="purple")
ax_mean.set_title("Moyenne des temps d'arrivée des camions")
ax_mean.set_xlabel("Temps moyen en secondes")
ax_mean.set_ylabel("Nombre d'incidents")

plt.tight_layout()
plt.show()
No description has been provided for this image
In [23]:
# Flag incidents without a second pump instead of leaving NaN.
# FIX: direct column assignment replaces the original chained
# `df[col].fillna(..., inplace=True)` — under pandas copy-on-write the chained
# form operates on a temporary and is deprecated (may silently do nothing).
placeholder = "Pas de seconde pompe"
df["SecondPumpArriving_DeployedFromStation"] = (
    df["SecondPumpArriving_DeployedFromStation"].fillna(placeholder)
)
# NOTE: filling a numeric time column with a string turns it into a mixed
# str/float object column — intentional here, downstream treats it as object.
df["SecondPumpArriving_AttendanceTime"] = (
    df["SecondPumpArriving_AttendanceTime"].fillna(placeholder)
)

Gestion des outliers¶

In [50]:
# Distribution of the total intervention (attendance) time.
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df["AttendanceTimeSeconds"], color="forestgreen", kde=True, ax=ax)
ax.set_title("Distribution du temps d'intervention")
ax.set_xlabel('temps en seconde')
ax.set_ylabel('Fréquence')
plt.show()
No description has been provided for this image
In [51]:
# Boxplot to locate AttendanceTimeSeconds outliers before IQR filtering.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=df["AttendanceTimeSeconds"], color='purple', ax=ax)
ax.set_title('outliers analysis')
ax.set_xlabel('Attendance Time')
plt.show()
No description has been provided for this image

Outliers "AttendanceTimeSeconds"¶

In [24]:
# IQR-based outlier filtering on the total attendance time.
Q1 = df["AttendanceTimeSeconds"].quantile(0.25)
print("Q1:", Q1)
Q3 = df["AttendanceTimeSeconds"].quantile(0.75)
print("Q3:", Q3)
IQR = Q3 - Q1
print("IQR:", IQR)

# Tukey fences: 1.5 * IQR beyond the quartiles.
limite_inf = Q1 - 1.5 * IQR
limite_sup = Q3 + 1.5 * IQR

# Keep only rows inside the fences (between() is inclusive on both ends).
inside_fences = df["AttendanceTimeSeconds"].between(limite_inf, limite_sup)
df_no_outliers = df[inside_fences]

# Row counts before/after filtering.
print("shape data avant suppression outliers :", df.shape)
print("shape data après suppression outliers :", df_no_outliers.shape)
print("nombre d'outilers supprimés :", (df.shape[0] - df_no_outliers.shape[0]))
Q1: 244.0
Q3: 418.0
IQR: 174.0
shape data avant suppression outliers : (217851, 84)
shape data après suppression outliers : (209690, 84)
nombre d'outilers supprimés : 8161
In [53]:
# Attendance-time distribution after IQR outlier removal.
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df_no_outliers["AttendanceTimeSeconds"], color="forestgreen", kde=True, ax=ax)
ax.set_title("Distribution du temps d'intervention")
ax.set_xlabel('temps en seconde')
ax.set_ylabel('Fréquence')
plt.show()
No description has been provided for this image
In [54]:
# Distribution of the turnout (crew preparation) time.
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df_no_outliers["TurnoutTimeSeconds"], color="forestgreen", kde=True, ax=ax)
ax.set_title("Distribution du temps de préparation")
ax.set_xlabel('temps en seconde')
ax.set_ylabel('Fréquence')
plt.show()
No description has been provided for this image
In [55]:
# Boxplot to locate TurnoutTimeSeconds outliers.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=df_no_outliers["TurnoutTimeSeconds"], color='purple', ax=ax)
ax.set_title('outliers analysis')
ax.set_xlabel('Turnout Time')
plt.show()
No description has been provided for this image

Outliers "TurnoutTimeSeconds"¶

In [25]:
# IQR-based outlier filtering on the turnout (preparation) time.
Q1 = df_no_outliers["TurnoutTimeSeconds"].quantile(0.25)
print("Q1:", Q1)
Q3 = df_no_outliers["TurnoutTimeSeconds"].quantile(0.75)
print("Q3:", Q3)
IQR = Q3 - Q1
print("IQR:", IQR)

# Tukey fences: 1.5 * IQR beyond the quartiles.
limite_inf = Q1 - 1.5 * IQR
limite_sup = Q3 + 1.5 * IQR

# Keep only rows inside the fences (between() is inclusive on both ends).
inside_fences = df_no_outliers["TurnoutTimeSeconds"].between(limite_inf, limite_sup)
df_no_outliers_2 = df_no_outliers[inside_fences]

# Row counts before/after filtering.
print("shape data avant suppression outliers :", df_no_outliers.shape)
print("shape data après suppression outliers :", df_no_outliers_2.shape)
print("nombre d'outilers supprimés :", (df_no_outliers.shape[0] - df_no_outliers_2.shape[0]))
Q1: 50.0
Q3: 89.0
IQR: 39.0
shape data avant suppression outliers : (209690, 84)
shape data après suppression outliers : (202361, 84)
nombre d'outilers supprimés : 7329
In [57]:
# Turnout-time distribution after IQR outlier removal.
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df_no_outliers_2["TurnoutTimeSeconds"], color="forestgreen", kde=True, ax=ax)
ax.set_title("Distribution du temps de préparation")
ax.set_xlabel('temps en seconde')
ax.set_ylabel('Fréquence')
plt.show()
No description has been provided for this image
In [58]:
# Distribution of the travel time to the incident.
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df_no_outliers_2["TravelTimeSeconds"], color="forestgreen", kde=True, ax=ax)
ax.set_title("Distribution du temps de trajet")
ax.set_xlabel('temps en seconde')
ax.set_ylabel('Fréquence')
plt.show()
No description has been provided for this image
In [59]:
# Boxplot to locate TravelTimeSeconds outliers.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=df_no_outliers_2["TravelTimeSeconds"], color='purple', ax=ax)
ax.set_title('outliers analysis')
ax.set_xlabel('TravelTime')
plt.show()
No description has been provided for this image

Outliers "TravelTimeSeconds"¶

In [26]:
# IQR-based outlier filtering on the travel time.
Q1 = df_no_outliers_2["TravelTimeSeconds"].quantile(0.25)
print("Q1:", Q1)
Q3 = df_no_outliers_2["TravelTimeSeconds"].quantile(0.75)
print("Q3:", Q3)
IQR = Q3 - Q1
print("IQR:", IQR)

# Tukey fences: 1.5 * IQR beyond the quartiles.
limite_inf = Q1 - 1.5 * IQR
limite_sup = Q3 + 1.5 * IQR

# Keep only rows inside the fences (between() is inclusive on both ends).
inside_fences = df_no_outliers_2["TravelTimeSeconds"].between(limite_inf, limite_sup)
df_no_outliers_3 = df_no_outliers_2[inside_fences]

# Row counts before/after filtering.
print("shape data avant suppression outliers :", df_no_outliers_2.shape)
print("shape data après suppression outliers :", df_no_outliers_3.shape)
print("nombre d'outilers supprimés :", (df_no_outliers_2.shape[0] - df_no_outliers_3.shape[0]))
Q1: 171.0
Q3: 332.0
IQR: 161.0
shape data avant suppression outliers : (202361, 84)
shape data après suppression outliers : (200068, 84)
nombre d'outilers supprimés : 2293
In [61]:
# Travel-time distribution after IQR outlier removal.
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df_no_outliers_3["TravelTimeSeconds"], color="forestgreen", kde=True, ax=ax)
ax.set_title("Distribution du temps de trajet")
ax.set_xlabel('temps en seconde')
ax.set_ylabel('Fréquence')
plt.show()
No description has been provided for this image
In [27]:
df = df_no_outliers_3

DATAVIZ¶

  1. Analyse des temps de mobilisation, de déplacement et d'intervention : "TurnoutTimeSeconds", "TravelTimeSeconds" et "AttendanceTimeSeconds"
  2. Facteurs géographiques: "IncGeo_BoroughName", "IncGeo_WardName", "Latitude", et "Longitude"
  3. Ressources mobilisées et stations de déploiement: "DeployedFromStation_Name", "PumpOrder", et "NumPumpsAttending"
  4. Temps d'appel et d'arrivée: "DateAndTimeMobilised", "DateAndTimeMobile", "DateAndTimeArrived", et "DateAndTimeLeft"
  5. Motifs de retard et retards : "DelayCodeId" et "DelayCode_Description"

Viz :

  1. Boxplot temps d'intervention par région : voir la distrib des temps d'intervention par zone
  2. Heatmap des temps de déplacement: temps de déplacement moyen par zone
  3. histogramme des temps de mobilisation de deplacement et d'intervention : distribution des temps
  4. scatterplot des temps d'intervention par nombre de pompes déployées : comment le nombre de ressources affectent le temps d'intervention
  5. Un histogramme pour visualiser les motifs les plus fréquent justifiant les retards
In [63]:
# Attendance-time distribution per borough.
fig, ax = plt.subplots(figsize=(14, 7))
sns.boxplot(data=df, x="IncGeo_BoroughName", y="AttendanceTimeSeconds", ax=ax)
ax.tick_params(axis='x', rotation=90)
ax.set_title("Distribution des temps d'intervention par région")
ax.set_xlabel("Région")
ax.set_ylabel("Temps d'intervention (secondes)")
plt.show()
No description has been provided for this image
In [64]:
# Heatmap of the mean travel time on a coarse geographic grid.
# Rounding coordinates to 2 decimals limits the number of unique cells.
df["Latitude_incident_rounded"] = df["Latitude_incident"].round(2)
df["Longitude_incident_rounded"] = df["Longitude_incident"].round(2)

plt.figure(figsize=(10, 8))
mean_travel = (
    df.groupby(["Latitude_incident_rounded", "Longitude_incident_rounded"])["TravelTimeSeconds"]
    .mean()
    .reset_index()
)
travel_grid = mean_travel.pivot(
    index="Latitude_incident_rounded",
    columns="Longitude_incident_rounded",
    values="TravelTimeSeconds",
)
sns.heatmap(travel_grid, cmap="Set2")
plt.title("Heatmap des temps de déplacement moyens par zone géographique")
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.show()
No description has been provided for this image
In [65]:
# Distributions of turnout, travel and attendance times in one figure.
plt.figure(figsize=(18, 15))

panels = [
    ("TurnoutTimeSeconds", "Distribution des temps de mobilisation",
     "Temps de mobilisation (secondes)"),
    ("TravelTimeSeconds", "Distribution des temps de déplacement",
     "Temps de déplacement (secondes)"),
    ("AttendanceTimeSeconds", "Distribution des temps d'intervention",
     "Temps d'intervention (secondes)"),
]
for position, (column, title, xlabel) in enumerate(panels, start=1):
    plt.subplot(2, 2, position)
    sns.histplot(df[column], bins=50, kde=True)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Fréquence")

plt.show()
No description has been provided for this image
In [66]:
# Mean attendance time by number of pumps deployed, split by incident type.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=df, x="NumPumpsAttending", y="AttendanceTimeSeconds",
            hue="IncidentGroup", errorbar=None, ax=ax)
ax.set_title("Temps d'intervention par nombre de pompes déployées en fonction du type d'incident")
ax.set_xlabel("Nombre de pompes déployées")
ax.set_ylabel("Temps d'intervention (secondes)")
plt.show()
No description has been provided for this image
In [67]:
# Mean travel time by weekday and by hour of the day.
df["Weekday_mobilised"] = df["DateAndTimeMobilised"].dt.weekday

plt.figure(figsize=(16, 12))
plt.suptitle('Temps de trajet vers incident', fontsize=22)

# Left panel: weekly profile (weekday 0 = Monday).
plt.subplot(221)
weekday_means = df.groupby(['Weekday_mobilised'])['TravelTimeSeconds'].mean().reset_index()
sns.barplot(x='Weekday_mobilised', y='TravelTimeSeconds', data=weekday_means)
plt.xticks([0, 1, 2, 3, 4, 5, 6],
           ['lundi', 'mardi', 'mercredi', 'jeudi', 'vendredi', 'samedi', 'dimanche'])
plt.title('Evolution du temps de trajet sur une semaine')
plt.ylabel('Temps de trajet en secondes')
plt.xlabel('Jour')

# Right panel: daily profile by call hour.
plt.subplot(222)
hourly_means = df.groupby(['HourOfCall'])['TravelTimeSeconds'].mean().reset_index()
sns.barplot(x='HourOfCall', y='TravelTimeSeconds', data=hourly_means)
plt.title('Evolution du temps de trajet sur une journée')
plt.xlabel('Heure')
plt.ylabel('Temps de trajet en secondes');
No description has been provided for this image
In [68]:
# Frequency of each recorded delay reason.
delayed_rows = df[df["DelayCodeId"].notna()]

# Count occurrences per delay description.
reason_counts = delayed_rows["DelayCode_Description"].value_counts().reset_index()
reason_counts.columns = ["DelayCode_Description", "Count"]

# Horizontal bar chart, most frequent reason on top.
fig, ax = plt.subplots(figsize=(14, 7))
sns.barplot(data=reason_counts, x="Count", y="DelayCode_Description",
            palette="viridis", ax=ax)
ax.set_title("Nombre de retards par motif de retard")
ax.set_xlabel("Nombre de retards")
ax.set_ylabel("Motif de retard")
plt.show()
No description has been provided for this image
In [69]:
# Mean attendance time per year, split by incident type.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x="CalYear", y="AttendanceTimeSeconds", hue="IncidentGroup", data=df, ax=ax)
ax.set_xlabel("Année")
ax.set_ylabel("Temps d'intervention total en secondes")
ax.set_title("Temps d'intervention total par année et par type d'incidents")
plt.show()
No description has been provided for this image
In [70]:
# Year-by-year spread of the attendance time.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(x="CalYear", y="AttendanceTimeSeconds", data=df, ax=ax)
ax.set_xlabel("Année")
ax.set_ylabel("Temps d'intervention total en secondes")
ax.set_title("Boxplot du temps d'intervention total par année")
plt.show()
No description has been provided for this image
In [71]:
# Incident counts per borough, one row per unique incident
# (the mobilisation table has one row per deployed resource).
df_unique_incidents = df.drop_duplicates(subset=['IncidentNumber'])

fig, ax = plt.subplots(figsize=(14, 7))
borough_order = df_unique_incidents['IncGeo_BoroughName'].value_counts().index
sns.countplot(data=df_unique_incidents, y='IncGeo_BoroughName', order=borough_order, ax=ax)
ax.tick_params(axis='x', rotation=90)
ax.set_xlabel("Nombre d'interventions")
ax.set_ylabel("Arrondissement")
ax.set_title("Distribution du nombre d'interventions par arrondissement")
plt.show()
No description has been provided for this image
In [72]:
# Yearly mean attendance time of the first and second pump on scene.
first_pump_avg = incident_2018_onwards.groupby('CalYear')['FirstPumpArriving_AttendanceTime'].mean()
second_pump_avg = incident_2018_onwards.groupby('CalYear')['SecondPumpArriving_AttendanceTime'].mean()

plt.figure(figsize=(24, 8))
plt.plot(first_pump_avg.index, first_pump_avg.values, color='blue',
         marker='o', linestyle='-', label='1ère équipe')
plt.plot(second_pump_avg.index, second_pump_avg.values, color='red',
         marker='o', linestyle='-', label='2ème équipe')

plt.xlabel('Années')
plt.ylabel('Temps arrivé en seconde')
plt.title('Temps moyen intervention 1ère et 2ème équipe')

# Dashed reference lines at the all-period means.
plt.axhline(y=first_pump_avg.mean(), color='blue', linestyle='--',
            label='2009-2024 Moyenne 1ère équipe')
plt.axhline(y=second_pump_avg.mean(), color='red', linestyle='--',
            label='2009-2024 Moyenne 2ème équipe')

plt.legend()
plt.grid(True)
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [73]:
# Stacked yearly incident volume, coloured by incident type.
plt.figure(figsize=(24, 8))
sns.set_style("whitegrid")

ax = sns.histplot(
    data=incident_2018_onwards,
    x="CalYear",
    hue="IncidentGroup",
    multiple="stack",
    stat="count",
    palette="Set2",
)
ax.set_xlabel('Années')
ax.set_ylabel('Volume')
ax.set_title("Evolution et nombre de 'False Alarm' entre 2009 et 2024")
plt.show()
No description has been provided for this image
In [74]:
# Share of each incident type.
group_counts = incident_2018_onwards.IncidentGroup.value_counts()

plt.figure(figsize=(12, 8))
plt.pie(group_counts.values, labels=group_counts.index,
        autopct="%1.1f%%", labeldistance=1.1)
plt.title("Répartition du type d'incident")
plt.show()
No description has been provided for this image
In [75]:
# Breakdown of 'Special Service' interventions by sub-type.
Special_Service = incident_2018_onwards[incident_2018_onwards['IncidentGroup'] == 'Special Service']
service_counts = Special_Service.SpecialServiceType.value_counts()

plt.figure(figsize=(12, 8))
plt.pie(service_counts.values, labels=service_counts.index,
        autopct="%1.1f%%", pctdistance=0.8)
plt.title("Répartition des interventions 'Special Service'", fontsize=20)
plt.show()
No description has been provided for this image
In [77]:
# Temporal distributions: months, weekdays, time-of-day buckets, seasons.
fig, axes = plt.subplots(4, 1, figsize=(18, 18))

# Month distribution (Month is numerically encoded 1-12 at this point).
sns.countplot(ax=axes[0], x="Month", data=df, order=df["Month"].value_counts().index)
axes[0].set_title("Distribution des mois")
axes[0].set_ylabel("Nombre d'incidents")
axes[0].set_xlabel("Mois")

# Weekday distribution.
# FIX: the original passed `label=[...]` to countplot; `label` expects a single
# legend string, so the list of day names was silently ignored. Rename the
# ticks explicitly instead so the axis shows day names, as intended.
sns.countplot(ax=axes[1], x="DayOfWeek", data=df, order=[1, 2, 3, 4, 5, 6, 7])
axes[1].set_xticklabels(["Monday", "Tuesday", "Wednesday", "Thursday",
                         "Friday", "Saturday", "Sunday"])
axes[1].set_title("Distribution des jours de la semaine")
axes[1].set_ylabel("Nombre d'incidents")
axes[1].set_xlabel("Jour de la semaine")

# Time-of-day bucket distribution, in chronological order.
sns.countplot(ax=axes[2], x="TimeOfDay", data=df,
              order=["Night", "Early Morning", "Morning", "Afternoon", "Evening", "Late Evening"])
axes[2].set_title("Distribution des moments de la journée")
axes[2].set_ylabel("Nombre d'incidents")
axes[2].set_xlabel("Moment de la journée")

# Season distribution.
sns.countplot(ax=axes[3], x="Season", data=df, order=["Winter", "Spring", "Summer", "Fall"])
axes[3].set_title("Distribution des saisons")
axes[3].set_ylabel("Nombre d'incidents")
axes[3].set_xlabel("Saisons")

plt.tight_layout()
plt.show()
No description has been provided for this image
In [117]:
# Persist the engineered DataFrame to an Excel file
# (note: this writes .xlsx, not CSV as the original comment suggested).
df.to_excel('DF_Pompier.xlsx', index=False)

Selections des variables explicatives¶

In [31]:
# Explanatory variables retained for modelling.
# FIX: `.copy()` replaces the redundant `lfb = pd.DataFrame(lfb)` re-wrap of
# the original; an explicit copy decouples the selection from `df`, so the
# downstream imputation on `lfb` cannot raise SettingWithCopyWarning.
lfb = df[["CalYear", "Geo_zone", "Categ_geo",
    "Latitude_incident", "Longitude_incident",
    "DistanceToIncident_km", "Latitude_station_deploy",
    "Longitude_station_deploy", "NumPumpsAttending",
    "PumpOrder", "PumpCount",
    "FirstPumpArriving_DeployedFromStation", "FirstPumpArriving_AttendanceTime",
    "SecondPumpArriving_AttendanceTime", "SecondPumpArriving_DeployedFromStation",
    "IncidentSeverity", "IncidentsDensityByZone",
    "HasDelay", "GeoZonePumpsInteraction",
    "DelayCode_Description", "NumStationsWithPumpsAttending",
    "SpecialServiceType", "Easting_m", "Northing_m",
    "IncidentGroup", "PropertyType",
    "Easting_rounded", "Northing_rounded",
    "IncidentStationGround", "StopCodeDescription",
    "PlusCode_Description", "DelayCodeId",
    "PropertyCategory", "AddressQualifier",
    "AvgResponseTimeByStation", "DistanceTimeInteraction",
    "AttendanceTimeSeconds",
    "HourOfCall_sin", "HourOfCall_cos",
    "DayOfWeek_sin", "DayOfWeek_cos",
    "Month_sin", "Month_cos", "TimeOfDay"]].copy()
In [79]:
lfb.columns
Out[79]:
Index(['CalYear', 'Geo_zone', 'Categ_geo', 'Latitude_incident',
       'Longitude_incident', 'DistanceToIncident_km',
       'Latitude_station_deploy', 'Longitude_station_deploy',
       'NumPumpsAttending', 'PumpOrder', 'PumpCount',
       'FirstPumpArriving_DeployedFromStation',
       'FirstPumpArriving_AttendanceTime', 'SecondPumpArriving_AttendanceTime',
       'SecondPumpArriving_DeployedFromStation', 'IncidentSeverity',
       'IncidentsDensityByZone', 'HasDelay', 'GeoZonePumpsInteraction',
       'DelayCode_Description', 'NumStationsWithPumpsAttending',
       'SpecialServiceType', 'Easting_m', 'Northing_m', 'IncidentGroup',
       'PropertyType', 'Easting_rounded', 'Northing_rounded',
       'IncidentStationGround', 'StopCodeDescription', 'PlusCode_Description',
       'DelayCodeId', 'PropertyCategory', 'AddressQualifier',
       'AvgResponseTimeByStation', 'DistanceTimeInteraction',
       'AttendanceTimeSeconds', 'HourOfCall_sin', 'HourOfCall_cos',
       'DayOfWeek_sin', 'DayOfWeek_cos', 'Month_sin', 'Month_cos',
       'TimeOfDay'],
      dtype='object')
In [80]:
lfb.info()
<class 'pandas.core.frame.DataFrame'>
Index: 200068 entries, 0 to 585566
Data columns (total 44 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   CalYear                                 200068 non-null  int64  
 1   Geo_zone                                188180 non-null  object 
 2   Categ_geo                               188180 non-null  object 
 3   Latitude_incident                       200068 non-null  float64
 4   Longitude_incident                      200068 non-null  float64
 5   DistanceToIncident_km                   188180 non-null  float64
 6   Latitude_station_deploy                 188180 non-null  float64
 7   Longitude_station_deploy                188180 non-null  float64
 8   NumPumpsAttending                       200068 non-null  float64
 9   PumpOrder                               200068 non-null  int64  
 10  PumpCount                               200068 non-null  int64  
 11  FirstPumpArriving_DeployedFromStation   200064 non-null  object 
 12  FirstPumpArriving_AttendanceTime        200068 non-null  float64
 13  SecondPumpArriving_AttendanceTime       200068 non-null  object 
 14  SecondPumpArriving_DeployedFromStation  200068 non-null  object 
 15  IncidentSeverity                        200068 non-null  float64
 16  IncidentsDensityByZone                  188180 non-null  float64
 17  HasDelay                                200068 non-null  int32  
 18  GeoZonePumpsInteraction                 188180 non-null  float64
 19  DelayCode_Description                   49616 non-null   object 
 20  NumStationsWithPumpsAttending           200068 non-null  float64
 21  SpecialServiceType                      41418 non-null   object 
 22  Easting_m                               200068 non-null  float64
 23  Northing_m                              200068 non-null  float64
 24  IncidentGroup                           200068 non-null  object 
 25  PropertyType                            200068 non-null  object 
 26  Easting_rounded                         200068 non-null  int64  
 27  Northing_rounded                        200068 non-null  int64  
 28  IncidentStationGround                   200066 non-null  object 
 29  StopCodeDescription                     200068 non-null  object 
 30  PlusCode_Description                    200068 non-null  object 
 31  DelayCodeId                             49616 non-null   float64
 32  PropertyCategory                        200068 non-null  object 
 33  AddressQualifier                        200068 non-null  object 
 34  AvgResponseTimeByStation                200066 non-null  float64
 35  DistanceTimeInteraction                 188180 non-null  float64
 36  AttendanceTimeSeconds                   200068 non-null  int64  
 37  HourOfCall_sin                          200068 non-null  float64
 38  HourOfCall_cos                          200068 non-null  float64
 39  DayOfWeek_sin                           200068 non-null  float64
 40  DayOfWeek_cos                           200068 non-null  float64
 41  Month_sin                               200068 non-null  float64
 42  Month_cos                               200068 non-null  float64
 43  TimeOfDay                               200068 non-null  object 
dtypes: float64(22), int32(1), int64(6), object(15)
memory usage: 67.9+ MB
In [81]:
# Pairwise correlations between the numeric features.
numeric_features = lfb.select_dtypes(include=['float64', 'int64'])
corr = numeric_features.corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5, ax=ax)
ax.set_title("Correlation Heatmap of Numerical Features")
plt.show()
No description has been provided for this image

On utilise un échantillon aléatoire de 20 000 individus.

In [42]:
lfb_sampled = lfb.sample(n=20000, random_state=42)

Train test split et Nettoyage¶

In [43]:
# Target is the total attendance time; everything else is a feature.
X = lfb_sampled.drop(columns=["AttendanceTimeSeconds"])
y = lfb_sampled["AttendanceTimeSeconds"]

# 70/30 train/test split (seeded for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Impute numeric columns with the TRAIN median (no leakage into the test set).
# FIX: direct column assignment replaces the original chained
# `X_train[col].fillna(..., inplace=True)` — under pandas copy-on-write the
# chained form acts on a temporary slice and is deprecated (may do nothing).
for col in X_train.select_dtypes(include=["float64", "int64"]).columns:
    median_value = X_train[col].median()
    X_train[col] = X_train[col].fillna(median_value)
    X_test[col] = X_test[col].fillna(median_value)

# Impute categorical columns with the train mode; skip a column whose mode is
# undefined (all values missing) instead of raising IndexError.
for col in X_train.select_dtypes(include=['object']).columns:
    modes = X_train[col].mode()
    if modes.empty:
        continue
    mode_value = modes[0]
    X_train[col] = X_train[col].fillna(mode_value)
    X_test[col] = X_test[col].fillna(mode_value)

# Impute datetime columns with a fixed sentinel date.
default_date = pd.to_datetime('2022-01-05')
for col in X_train.select_dtypes(include=['datetime64[ns]']).columns:
    X_train[col] = X_train[col].fillna(default_date)
    X_test[col] = X_test[col].fillna(default_date)
In [44]:
display(X_train.info())
display(X_test.info())
<class 'pandas.core.frame.DataFrame'>
Index: 3500 entries, 372482 to 255068
Data columns (total 43 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   CalYear                                 3500 non-null   int64  
 1   Geo_zone                                3500 non-null   object 
 2   Categ_geo                               3500 non-null   object 
 3   Latitude_incident                       3500 non-null   float64
 4   Longitude_incident                      3500 non-null   float64
 5   DistanceToIncident_km                   3500 non-null   float64
 6   Latitude_station_deploy                 3500 non-null   float64
 7   Longitude_station_deploy                3500 non-null   float64
 8   NumPumpsAttending                       3500 non-null   float64
 9   PumpOrder                               3500 non-null   int64  
 10  PumpCount                               3500 non-null   int64  
 11  FirstPumpArriving_DeployedFromStation   3500 non-null   object 
 12  FirstPumpArriving_AttendanceTime        3500 non-null   float64
 13  SecondPumpArriving_AttendanceTime       3500 non-null   object 
 14  SecondPumpArriving_DeployedFromStation  3500 non-null   object 
 15  IncidentSeverity                        3500 non-null   float64
 16  IncidentsDensityByZone                  3500 non-null   float64
 17  HasDelay                                3500 non-null   int32  
 18  GeoZonePumpsInteraction                 3500 non-null   float64
 19  DelayCode_Description                   3500 non-null   object 
 20  NumStationsWithPumpsAttending           3500 non-null   float64
 21  SpecialServiceType                      3500 non-null   object 
 22  Easting_m                               3500 non-null   float64
 23  Northing_m                              3500 non-null   float64
 24  IncidentGroup                           3500 non-null   object 
 25  PropertyType                            3500 non-null   object 
 26  Easting_rounded                         3500 non-null   int64  
 27  Northing_rounded                        3500 non-null   int64  
 28  IncidentStationGround                   3500 non-null   object 
 29  StopCodeDescription                     3500 non-null   object 
 30  PlusCode_Description                    3500 non-null   object 
 31  DelayCodeId                             3500 non-null   float64
 32  PropertyCategory                        3500 non-null   object 
 33  AddressQualifier                        3500 non-null   object 
 34  AvgResponseTimeByStation                3500 non-null   float64
 35  DistanceTimeInteraction                 3500 non-null   float64
 36  HourOfCall_sin                          3500 non-null   float64
 37  HourOfCall_cos                          3500 non-null   float64
 38  DayOfWeek_sin                           3500 non-null   float64
 39  DayOfWeek_cos                           3500 non-null   float64
 40  Month_sin                               3500 non-null   float64
 41  Month_cos                               3500 non-null   float64
 42  TimeOfDay                               3500 non-null   object 
dtypes: float64(22), int32(1), int64(5), object(15)
memory usage: 1.2+ MB
None
<class 'pandas.core.frame.DataFrame'>
Index: 1500 entries, 141376 to 421214
Data columns (total 43 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   CalYear                                 1500 non-null   int64  
 1   Geo_zone                                1500 non-null   object 
 2   Categ_geo                               1500 non-null   object 
 3   Latitude_incident                       1500 non-null   float64
 4   Longitude_incident                      1500 non-null   float64
 5   DistanceToIncident_km                   1500 non-null   float64
 6   Latitude_station_deploy                 1500 non-null   float64
 7   Longitude_station_deploy                1500 non-null   float64
 8   NumPumpsAttending                       1500 non-null   float64
 9   PumpOrder                               1500 non-null   int64  
 10  PumpCount                               1500 non-null   int64  
 11  FirstPumpArriving_DeployedFromStation   1500 non-null   object 
 12  FirstPumpArriving_AttendanceTime        1500 non-null   float64
 13  SecondPumpArriving_AttendanceTime       1500 non-null   object 
 14  SecondPumpArriving_DeployedFromStation  1500 non-null   object 
 15  IncidentSeverity                        1500 non-null   float64
 16  IncidentsDensityByZone                  1500 non-null   float64
 17  HasDelay                                1500 non-null   int32  
 18  GeoZonePumpsInteraction                 1500 non-null   float64
 19  DelayCode_Description                   1500 non-null   object 
 20  NumStationsWithPumpsAttending           1500 non-null   float64
 21  SpecialServiceType                      1500 non-null   object 
 22  Easting_m                               1500 non-null   float64
 23  Northing_m                              1500 non-null   float64
 24  IncidentGroup                           1500 non-null   object 
 25  PropertyType                            1500 non-null   object 
 26  Easting_rounded                         1500 non-null   int64  
 27  Northing_rounded                        1500 non-null   int64  
 28  IncidentStationGround                   1500 non-null   object 
 29  StopCodeDescription                     1500 non-null   object 
 30  PlusCode_Description                    1500 non-null   object 
 31  DelayCodeId                             1500 non-null   float64
 32  PropertyCategory                        1500 non-null   object 
 33  AddressQualifier                        1500 non-null   object 
 34  AvgResponseTimeByStation                1500 non-null   float64
 35  DistanceTimeInteraction                 1500 non-null   float64
 36  HourOfCall_sin                          1500 non-null   float64
 37  HourOfCall_cos                          1500 non-null   float64
 38  DayOfWeek_sin                           1500 non-null   float64
 39  DayOfWeek_cos                           1500 non-null   float64
 40  Month_sin                               1500 non-null   float64
 41  Month_cos                               1500 non-null   float64
 42  TimeOfDay                               1500 non-null   object 
dtypes: float64(22), int32(1), int64(5), object(15)
memory usage: 509.8+ KB
None

Encodage¶

In [45]:
# Numeric and categorical feature columns
numerical_cols = X_train.select_dtypes(include=["int64", "float64"]).columns
categorical_cols = X_train.select_dtypes(include=["object"]).columns

# Cast every categorical column to string so the encoder sees a uniform dtype
X_train[categorical_cols] = X_train[categorical_cols].astype(str)
X_test[categorical_cols] = X_test[categorical_cols].astype(str)

# One-hot encode the categorical variables; categories seen only in the test
# set are ignored rather than raising.
# NOTE: the `sparse` keyword was renamed `sparse_output` in scikit-learn 1.2
# and removed in 1.4; `sparse_output=False` keeps the dense-array behavior.
encoder = OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore")
X_train_encoded = encoder.fit_transform(X_train[categorical_cols])
X_test_encoded = encoder.transform(X_test[categorical_cols])

# Re-attach the (unencoded) numeric columns in front of the dummy columns
X_train_encoded = np.concatenate([X_train.drop(columns=categorical_cols).values, X_train_encoded], axis=1)
X_test_encoded = np.concatenate([X_test.drop(columns=categorical_cols).values, X_test_encoded], axis=1)

# Standardize all features; the scaler is fitted on the training set only to
# avoid test-set leakage.
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train_encoded)
X_test_sc = scaler.transform(X_test_encoded)

# Sanity-check the resulting matrix shapes
print(f"X_train shape: {X_train_sc.shape}")
print(f"X_test shape: {X_test_sc.shape}")
X_train shape: (3500, 1082)
X_test shape: (1500, 1082)
In [46]:
# Sanity check: encoded feature names produced by the fitted encoder
cat_feature_names = encoder.get_feature_names_out(categorical_cols)

print("Nombre de colonnes numériques :", len(numerical_cols))
print("Nombre de colonnes encodées :", len(cat_feature_names))
print("Nombre total de colonnes attendues :", len(numerical_cols) + len(cat_feature_names))
print("Nombre total de colonnes dans X_train_sc :", X_train_sc.shape[1])
Nombre de colonnes numériques : 27
Nombre de colonnes encodées : 1054
Nombre total de colonnes attendues : 1081
Nombre total de colonnes dans X_train_sc : 1082
In [48]:
# Expected column layout: numeric columns first, then the one-hot dummy names
expected_cols = numerical_cols.tolist() + encoder.get_feature_names_out(categorical_cols).tolist()

# The previous cell shows X_train_sc has 1082 columns for 1081 expected names:
# the slice below would silently drop the trailing extra column. Surface the
# mismatch so the column alignment can be audited instead of hidden.
if X_train_sc.shape[1] != len(expected_cols):
    print(f"Attention : X_train_sc contient {X_train_sc.shape[1]} colonnes "
          f"pour {len(expected_cols)} noms attendus ; colonnes excédentaires tronquées.")

X_train_df = pd.DataFrame(X_train_sc[:, :len(expected_cols)], columns=expected_cols)

X_train_df.to_csv('X_train_encoded_5000.csv', index=False)

Random Forest avec recherche des hyperparamètres¶

In [91]:
rf = RandomForestRegressor(random_state=42)

# Hyperparameter search space (2 x 9 = 18 candidate combinations)
param_rf = {
    "max_features": ["sqrt", "log2"],
    "min_samples_split": np.arange(2, 20, 2)}

# Randomized search with a limited number of iterations.
# random_state makes the sampled candidates reproducible across runs.
random_search_rf = RandomizedSearchCV(rf, param_rf, cv=5, scoring="neg_mean_squared_error",
                                      n_iter=10, n_jobs=-1, random_state=42)
random_search_rf.fit(X_train_sc, y_train)

# Best parameters found by the search
best_param_rf = random_search_rf.best_params_
print("Meilleurs paramètres :", best_param_rf)

# RandomizedSearchCV refits the best model on the full training set
# (refit=True by default), so reuse it rather than training a second copy.
rf_best = random_search_rf.best_estimator_

# Predict on the held-out test set
y_pred_rf_best = rf_best.predict(X_test_sc)

# Performance metrics (RMSE derived from MSE to avoid a second computation)
mse_final = mean_squared_error(y_test, y_pred_rf_best)
rmse_final = np.sqrt(mse_final)
mae_final = mean_absolute_error(y_test, y_pred_rf_best)
r2_final = r2_score(y_test, y_pred_rf_best)
medae_final = median_absolute_error(y_test, y_pred_rf_best)

print("RMSE:", rmse_final)
print("R²:", r2_final)
print("MAE:", mae_final)
print("MSE:", mse_final)
print("MedAE:", medae_final)
Meilleurs paramètres : {'min_samples_split': 2, 'max_features': 'sqrt'}
RMSE: 54.7967637493585
R²: 0.7902576593236635
MAE: 37.484719444444444
MSE: 3002.6853174030093
MedAE: 25.144999999999982
In [92]:
importances = rf_best.feature_importances_

# Feature names: numeric columns first, then the one-hot dummy names,
# matching the column order used when building X_train_sc.
feature_names = np.concatenate([X_train.drop(columns=categorical_cols).columns, encoder.get_feature_names_out(categorical_cols)])

# Keep only the ten most important features, largest first
top10 = np.argsort(importances)[::-1][:10]
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title("Top 10 des caractéristiques les plus importantes")
ax.barh(np.arange(top10.size), importances[top10], align="center", color="skyblue")
ax.set_yticks(np.arange(top10.size))
ax.set_yticklabels(feature_names[top10])
ax.invert_yaxis()
ax.set_xlabel("Importance des caractéristiques")
plt.show()
No description has been provided for this image
In [93]:
# Importances expressed as percentages, sorted in decreasing order
feature_importance = (
    pd.DataFrame({"Feature": feature_names, "Importance": importances * 100})
    .sort_values(by="Importance", ascending=False)
)

# Show the twenty strongest predictors
print(feature_importance.head(20))
                                           Feature  Importance
9                 FirstPumpArriving_AttendanceTime   19.792545
12                                        HasDelay   12.917602
3                            DistanceToIncident_km    7.210165
21                         DistanceTimeInteraction    6.322682
927              DelayCode_Description_Not held up    2.771588
7                                        PumpOrder    2.646140
19                                     DelayCodeId    2.330914
10                                IncidentSeverity    2.073542
2                               Longitude_incident    1.545136
15                                       Easting_m    1.469373
16                                      Northing_m    1.442884
1                                Latitude_incident    1.432779
17                                 Easting_rounded    1.403299
18                                Northing_rounded    1.387039
20                        AvgResponseTimeByStation    1.353670
930  DelayCode_Description_Traffic, roadworks, etc    1.292071
14                   NumStationsWithPumpsAttending    1.176261
13                         GeoZonePumpsInteraction    1.020189
5                         Longitude_station_deploy    0.993469
23                                  HourOfCall_cos    0.928109
In [94]:
# 1. Actual vs predicted values
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred_rf_best, alpha=0.6)
lims = [y_test.min(), y_test.max()]
ax.plot(lims, lims, "k--", lw=2)
ax.set_title("Comparaison des valeurs réelles vs prédites")
ax.set_xlabel("Valeurs réelles")
ax.set_ylabel("Valeurs prédites")
ax.grid(True)
plt.show()

# 2. Histogram of the residuals
errors = y_test - y_pred_rf_best
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(errors, bins=30, kde=True, color="purple", ax=ax)
ax.set_title("Distribution des erreurs (résidus)")
ax.set_xlabel("Erreur")
ax.set_ylabel("Fréquence")
ax.grid(True)
plt.show()

# 3. Residuals vs predicted values
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_pred_rf_best, errors, alpha=0.6)
ax.axhline(y=0, color="r", linestyle="--")
ax.set_title("Analyse des résidus")
ax.set_xlabel("Valeurs prédites")
ax.set_ylabel("Résidus")
ax.grid(True)
plt.show()

# 4. Empirical CDF of the residuals
fig, ax = plt.subplots(figsize=(10, 6))
sns.ecdfplot(errors, color="green", ax=ax)
ax.set_title("Courbe des résidus cumulés")
ax.set_xlabel("Erreur")
ax.set_ylabel("Cumul")
ax.grid(True)
plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Random Forest min_samples_split=2, max_features='sqrt'¶

In [99]:
# Random forest with the hyperparameters selected by the earlier search
rf = RandomForestRegressor(min_samples_split=2, max_features='sqrt', random_state=42)
rf.fit(X_train_sc, y_train)

# Predict on the held-out test set
y_pred_rf = rf.predict(X_test_sc)

# Performance metrics (RMSE derived from MSE)
mse_final = mean_squared_error(y_test, y_pred_rf)
rmse_final = np.sqrt(mse_final)
mae_final = mean_absolute_error(y_test, y_pred_rf)
r2_final = r2_score(y_test, y_pred_rf)
medae_final = median_absolute_error(y_test, y_pred_rf)

# Report the results
for label, value in [("RMSE", rmse_final), ("R²", r2_final), ("MAE", mae_final),
                     ("MSE", mse_final), ("MedAE", medae_final)]:
    print(f"{label}:", value)
RMSE: 47.45750154699462
R²: 0.843668408649996
MAE: 31.98477698921079
MSE: 2252.214453082997
MedAE: 21.379999999999995
In [100]:
importances = rf.feature_importances_

# Feature names: numeric columns first, then the one-hot dummy names
feature_names = np.concatenate([X_train.drop(columns=categorical_cols).columns, encoder.get_feature_names_out(categorical_cols)])

# Keep only the ten most important features, largest first
top10 = np.argsort(importances)[::-1][:10]
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title("Top 10 des caractéristiques les plus importantes")
ax.barh(np.arange(top10.size), importances[top10], align="center", color="skyblue")
ax.set_yticks(np.arange(top10.size))
ax.set_yticklabels(feature_names[top10])
ax.invert_yaxis()
ax.set_xlabel("Importance des caractéristiques")
plt.show()
No description has been provided for this image
In [101]:
# 1. Actual vs predicted values
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred_rf, alpha=0.6)
lims = [y_test.min(), y_test.max()]
ax.plot(lims, lims, "k--", lw=2)
ax.set_title("Comparaison des valeurs réelles vs prédites")
ax.set_xlabel("Valeurs réelles")
ax.set_ylabel("Valeurs prédites")
ax.grid(True)
plt.show()

# 2. Histogram of the residuals
errors = y_test - y_pred_rf
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(errors, bins=30, kde=True, color="purple", ax=ax)
ax.set_title("Distribution des erreurs (résidus)")
ax.set_xlabel("Erreur")
ax.set_ylabel("Fréquence")
ax.grid(True)
plt.show()

# 3. Residuals vs predicted values
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_pred_rf, errors, alpha=0.6)
ax.axhline(y=0, color="r", linestyle="--")
ax.set_title("Analyse des résidus")
ax.set_xlabel("Valeurs prédites")
ax.set_ylabel("Résidus")
ax.grid(True)
plt.show()

# 4. Empirical CDF of the residuals
fig, ax = plt.subplots(figsize=(10, 6))
sns.ecdfplot(errors, color="green", ax=ax)
ax.set_title("Courbe des résidus cumulés")
ax.set_xlabel("Erreur")
ax.set_ylabel("Cumul")
ax.grid(True)
plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Régression Lasso avec recherche des hyperparamètres¶

In [40]:
# Lasso is already imported in the top-of-notebook imports cell; the local
# import is kept only so this cell stays runnable on its own.
from sklearn.linear_model import Lasso

lasso = Lasso(max_iter=5000, random_state=42)

# Search alpha on a log scale between 1e-3 and 1
param_lasso = {"alpha": np.logspace(-3, 0, 20)}

random_search_lasso = RandomizedSearchCV(lasso, param_lasso, cv=5, scoring='neg_mean_squared_error', n_iter=10, n_jobs=-1, random_state=42)
random_search_lasso.fit(X_train_sc, y_train)

# Best parameters found by the search
best_param_lasso = random_search_lasso.best_params_
print("Meilleurs paramètres pour Lasso:", best_param_lasso)

# refit=True (default): the search already retrained the best model on the
# full training set, so reuse it rather than fitting a fresh copy.
lasso_best = random_search_lasso.best_estimator_

y_pred_lasso_best = lasso_best.predict(X_test_sc)

# Performance metrics (RMSE derived from MSE)
mse_lasso = mean_squared_error(y_test, y_pred_lasso_best)
rmse_lasso = np.sqrt(mse_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso_best)
mae_lasso = mean_absolute_error(y_test, y_pred_lasso_best)
medae_lasso = median_absolute_error(y_test, y_pred_lasso_best)

print("RMSE:", rmse_lasso)
print("R²:", r2_lasso)
print("MAE:", mae_lasso)
print("MSE:", mse_lasso)
print("MedAE:", medae_lasso)
Meilleurs paramètres pour Lasso: {'alpha': 0.3359818286283781}
RMSE: 40.623398953392105
R²: 0.878737401304899
MAE: 24.94076159010507
MSE: 1650.2605425264587
MedAE: 15.877268590254744
In [232]:
# 1. Actual vs predicted values for the Lasso model
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred_lasso_best, alpha=0.6)
lims = [y_test.min(), y_test.max()]
ax.plot(lims, lims, "k--", lw=2)
ax.set_title("Comparaison des valeurs réelles vs prédites (Modèle Lasso)")
ax.set_xlabel("Valeurs réelles")
ax.set_ylabel("Valeurs prédites")
ax.grid(True)
plt.show()

# 2. Histogram of the residuals
errors = y_test - y_pred_lasso_best
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(errors, bins=30, kde=True, color="purple", ax=ax)
ax.set_title("Distribution des erreurs (résidus) (Modèle Lasso)")
ax.set_xlabel("Erreur")
ax.set_ylabel("Fréquence")
ax.grid(True)
plt.show()

# 3. Residuals vs predicted values
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_pred_lasso_best, errors, alpha=0.6)
ax.axhline(y=0, color="r", linestyle="--")
ax.set_title("Analyse des résidus (Modèle Lasso)")
ax.set_xlabel("Valeurs prédites")
ax.set_ylabel("Résidus")
ax.grid(True)
plt.show()

# 4. Empirical CDF of the residuals
fig, ax = plt.subplots(figsize=(10, 6))
sns.ecdfplot(errors, color="green", ax=ax)
ax.set_title("Courbe des résidus cumulés (Modèle Lasso)")
ax.set_xlabel("Erreur")
ax.set_ylabel("Cumul")
ax.grid(True)
plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Régression Lasso avec alpha = 0.3¶

In [102]:
# Lasso with a fixed regularization strength (alpha chosen near the search optimum)
lasso_model = Lasso(alpha=0.3, max_iter=10000, random_state=42)
lasso_model.fit(X_train_sc, y_train)

y_pred_lasso = lasso_model.predict(X_test_sc)

# Performance metrics (RMSE derived from MSE)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
rmse_lasso = np.sqrt(mse_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)
mae_lasso = mean_absolute_error(y_test, y_pred_lasso)
medae_lasso = median_absolute_error(y_test, y_pred_lasso)

print("Lasso Regression (alpha = 0.3) Performances :")
print("RMSE:", rmse_lasso)
print("R²:", r2_lasso)
print("MAE:", mae_lasso)
print("MSE:", mse_lasso)
print("MedAE:", medae_lasso)
Lasso Regression (alpha = 0.3) Performances :
RMSE: 40.50728999139992
R²: 0.8861053338853838
MAE: 25.190362094298372
MSE: 1640.8405424473685
MedAE: 16.742184218489854
In [ ]:
import joblib
# Persist the fitted fixed-alpha Lasso model to disk for later reuse
joblib.dump(lasso_model, 'model_lasso.pkl')
In [103]:
# For a linear model, use the absolute coefficients as importances
importances = np.abs(lasso_model.coef_)

# Feature names: numeric columns first, then the one-hot dummy names
feature_names = np.concatenate([X_train.drop(columns=categorical_cols).columns, encoder.get_feature_names_out(categorical_cols)])

# Keep only the ten most important features, largest first
top10 = np.argsort(importances)[::-1][:10]
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title("Top 10 des caractéristiques les plus importantes")
ax.barh(np.arange(top10.size), importances[top10], align="center", color="skyblue")
ax.set_yticks(np.arange(top10.size))
ax.set_yticklabels(feature_names[top10])
ax.invert_yaxis()
ax.set_xlabel("Importance des caractéristiques")
plt.show()
No description has been provided for this image
In [104]:
# Absolute Lasso coefficients as importances, sorted in decreasing order
feature_importance = (
    pd.DataFrame({"Feature": feature_names, "Importance": importances})
    .sort_values(by="Importance", ascending=False)
)

# Show the twenty strongest predictors
print(feature_importance.head(20))
                                                Feature  Importance
9                      FirstPumpArriving_AttendanceTime   84.308936
7                                             PumpOrder   48.775374
12                                             HasDelay   22.231762
6                                     NumPumpsAttending   17.260384
14                        NumStationsWithPumpsAttending   12.419868
1201  SecondPumpArriving_AttendanceTime_Pas de secon...    7.053949
1405                         PropertyType_Fire station     2.852885
459               SecondPumpArriving_AttendanceTime_3.0    2.065155
20                             AvgResponseTimeByStation    1.915488
680               SecondPumpArriving_AttendanceTime_5.0    1.826000
1321                SpecialServiceType_Medical Incident    1.815927
569               SecondPumpArriving_AttendanceTime_4.0    1.641104
1010              SecondPumpArriving_AttendanceTime_8.0    1.555689
349               SecondPumpArriving_AttendanceTime_2.0    1.379268
1122             SecondPumpArriving_AttendanceTime_91.0    1.189987
372              SecondPumpArriving_AttendanceTime_22.0    1.169444
1132            SecondPumpArriving_AttendanceTime_923.0    1.113083
790               SecondPumpArriving_AttendanceTime_6.0    1.105995
243             SecondPumpArriving_AttendanceTime_116.0    1.030249
536              SecondPumpArriving_AttendanceTime_37.0    1.002388
In [105]:
# 1. Actual vs predicted values for the Lasso model
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred_lasso, alpha=0.6)
lims = [y_test.min(), y_test.max()]
ax.plot(lims, lims, "k--", lw=2)
ax.set_title("Comparaison des valeurs réelles vs prédites (Modèle Lasso)")
ax.set_xlabel("Valeurs réelles")
ax.set_ylabel("Valeurs prédites")
ax.grid(True)
plt.show()

# 2. Histogram of the residuals
errors = y_test - y_pred_lasso
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(errors, bins=30, kde=True, color="purple", ax=ax)
ax.set_title("Distribution des erreurs (résidus) (Modèle Lasso)")
ax.set_xlabel("Erreur")
ax.set_ylabel("Fréquence")
ax.grid(True)
plt.show()

# 3. Residuals vs predicted values
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_pred_lasso, errors, alpha=0.6)
ax.axhline(y=0, color="r", linestyle="--")
ax.set_title("Analyse des résidus (Modèle Lasso)")
ax.set_xlabel("Valeurs prédites")
ax.set_ylabel("Résidus")
ax.grid(True)
plt.show()

# 4. Empirical CDF of the residuals
fig, ax = plt.subplots(figsize=(10, 6))
sns.ecdfplot(errors, color="green", ax=ax)
ax.set_title("Courbe des résidus cumulés (Modèle Lasso)")
ax.set_xlabel("Erreur")
ax.set_ylabel("Cumul")
ax.grid(True)
plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

ElasticNet avec recherche des hyperparamètres¶

In [37]:
# ElasticNet is already imported in the top-of-notebook imports cell; the
# local import is kept only so this cell stays runnable on its own.
from sklearn.linear_model import ElasticNet

elastic_net = ElasticNet(max_iter=5000, random_state=42)

# Search space: alpha on a log scale, l1_ratio from mostly-ridge to pure lasso
param_elastic_net = {
    "alpha": np.logspace(-3, 0, 20),
    "l1_ratio": np.linspace(0.1, 1, 10)}

# Randomized search over the parameter space
random_search_elastic_net = RandomizedSearchCV(elastic_net, param_elastic_net, cv=5, scoring='neg_mean_squared_error', n_iter=10, n_jobs=-1, random_state=42)
random_search_elastic_net.fit(X_train_sc, y_train)

# Best parameters found by the search
best_param_elastic_net = random_search_elastic_net.best_params_
print("Meilleurs paramètres pour ElasticNet:", best_param_elastic_net)

# refit=True (default): the search already retrained the best model on the
# full training set, so reuse it rather than fitting a fresh copy.
elastic_net_best = random_search_elastic_net.best_estimator_

y_pred_elastic_net_best = elastic_net_best.predict(X_test_sc)

# Performance metrics (RMSE derived from MSE)
mse_elastic_net = mean_squared_error(y_test, y_pred_elastic_net_best)
rmse_elastic_net = np.sqrt(mse_elastic_net)
r2_elastic_net = r2_score(y_test, y_pred_elastic_net_best)
mae_elastic_net = mean_absolute_error(y_test, y_pred_elastic_net_best)
medae_elastic_net = median_absolute_error(y_test, y_pred_elastic_net_best)

print("RMSE:", rmse_elastic_net)
print("R²:", r2_elastic_net)
print("MAE:", mae_elastic_net)
print("MSE:", mse_elastic_net)
print("MedAE:", medae_elastic_net)
Meilleurs paramètres pour ElasticNet: {'l1_ratio': 0.9, 'alpha': 0.23357214690901212}
RMSE: 42.82104246876511
R²: 0.8740398176602928
MAE: 26.551892074115237
MSE: 1833.641678111785
MedAE: 17.43072048605191
In [43]:
# For a linear model, use the absolute coefficients as importances
importances = np.abs(elastic_net_best.coef_)

# Feature names: numeric columns first, then the one-hot dummy names
feature_names = np.concatenate([X_train.drop(columns=categorical_cols).columns, encoder.get_feature_names_out(categorical_cols)])

# Keep only the ten most important features, largest first
top10 = np.argsort(importances)[::-1][:10]
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title("Top 10 des caractéristiques les plus importantes (ElasticNet)")
ax.barh(np.arange(top10.size), importances[top10], align="center", color="skyblue")
ax.set_yticks(np.arange(top10.size))
ax.set_yticklabels(feature_names[top10])
ax.invert_yaxis()
ax.set_xlabel("Importance des caractéristiques")
plt.show()
No description has been provided for this image
In [39]:
# 1. Actual vs predicted values
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred_elastic_net_best, alpha=0.6)
lims = [y_test.min(), y_test.max()]
ax.plot(lims, lims, "k--", lw=2)
ax.set_title("Comparaison des valeurs réelles vs prédites (ElasticNet)")
ax.set_xlabel("Valeurs réelles")
ax.set_ylabel("Valeurs prédites")
ax.grid(True)
plt.show()

# 2. Histogram of the residuals
errors = y_test - y_pred_elastic_net_best
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(errors, bins=30, kde=True, color="purple", ax=ax)
ax.set_title("Distribution des erreurs (résidus) (ElasticNet)")
ax.set_xlabel("Erreur")
ax.set_ylabel("Fréquence")
ax.grid(True)
plt.show()

# 3. Residuals vs predicted values
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_pred_elastic_net_best, errors, alpha=0.6)
ax.axhline(y=0, color="r", linestyle="--")
ax.set_title("Analyse des résidus (ElasticNet)")
ax.set_xlabel("Valeurs prédites")
ax.set_ylabel("Résidus")
ax.grid(True)
plt.show()

# 4. Empirical CDF of the residuals
fig, ax = plt.subplots(figsize=(10, 6))
sns.ecdfplot(errors, color="green", ax=ax)
ax.set_title("Courbe des résidus cumulés (ElasticNet)")
ax.set_xlabel("Erreur")
ax.set_ylabel("Cumul")
ax.grid(True)
plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

ElasticNet avec alpha = 0.233 et l1_ratio = 0.9¶

In [49]:
# ElasticNet with manually fixed hyperparameters (taken from the earlier search)
elastic_net = ElasticNet(alpha=0.233, l1_ratio=0.9, max_iter=5000, random_state=42)
elastic_net.fit(X_train_sc, y_train)

# Predict on the held-out test set
y_pred_elastic_net = elastic_net.predict(X_test_sc)

# Performance metrics (RMSE derived from MSE)
mse_elastic_net = mean_squared_error(y_test, y_pred_elastic_net)
rmse_elastic_net = np.sqrt(mse_elastic_net)
r2_elastic_net = r2_score(y_test, y_pred_elastic_net)
mae_elastic_net = mean_absolute_error(y_test, y_pred_elastic_net)
medae_elastic_net = median_absolute_error(y_test, y_pred_elastic_net)

# Report the results
print("ElasticNet Regression Performances :")
for label, value in [("RMSE", rmse_elastic_net), ("R²", r2_elastic_net),
                     ("MAE", mae_elastic_net), ("MSE", mse_elastic_net),
                     ("MedAE", medae_elastic_net)]:
    print(f"{label}:", value)
ElasticNet Regression Performances :
RMSE: 48.65948021184131
R²: 0.8362127165995853
MAE: 29.607528875843485
MSE: 2367.7450144865766
MedAE: 17.76480954934665
In [50]:
import joblib
# Persist the fitted ElasticNet model to disk for later reuse
joblib.dump(elastic_net, 'model_elasticnet_5000.pkl')
Out[50]:
['model_elasticnet_5000.pkl']
In [109]:
# For a linear model, use the absolute coefficients as importances
importances = np.abs(elastic_net.coef_)

# Feature names: numeric columns first, then the one-hot dummy names
feature_names = np.concatenate([X_train.drop(columns=categorical_cols).columns, encoder.get_feature_names_out(categorical_cols)])

# Keep only the ten most important features, largest first
top10 = np.argsort(importances)[::-1][:10]
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title("Top 10 des caractéristiques les plus importantes (ElasticNet)")
ax.barh(np.arange(top10.size), importances[top10], align="center", color="skyblue")
ax.set_yticks(np.arange(top10.size))
ax.set_yticklabels(feature_names[top10])
ax.invert_yaxis()
ax.set_xlabel("Importance des caractéristiques")
plt.show()
No description has been provided for this image
In [110]:
# 1. Actual vs predicted values
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred_elastic_net, alpha=0.6)
lims = [y_test.min(), y_test.max()]
ax.plot(lims, lims, "k--", lw=2)
ax.set_title("Comparaison des valeurs réelles vs prédites (ElasticNet)")
ax.set_xlabel("Valeurs réelles")
ax.set_ylabel("Valeurs prédites")
ax.grid(True)
plt.show()

# 2. Histogram of the residuals
errors = y_test - y_pred_elastic_net
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(errors, bins=30, kde=True, color="purple", ax=ax)
ax.set_title("Distribution des erreurs (résidus) (ElasticNet)")
ax.set_xlabel("Erreur")
ax.set_ylabel("Fréquence")
ax.grid(True)
plt.show()

# 3. Residuals vs predicted values
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_pred_elastic_net, errors, alpha=0.6)
ax.axhline(y=0, color="r", linestyle="--")
ax.set_title("Analyse des résidus (ElasticNet)")
ax.set_xlabel("Valeurs prédites")
ax.set_ylabel("Résidus")
ax.grid(True)
plt.show()

# 4. Empirical CDF of the residuals
fig, ax = plt.subplots(figsize=(10, 6))
sns.ecdfplot(errors, color="green", ax=ax)
ax.set_title("Courbe des résidus cumulés (ElasticNet)")
ax.set_xlabel("Erreur")
ax.set_ylabel("Cumul")
ax.grid(True)
plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [ ]: